Compare commits

..

15 Commits

Author SHA1 Message Date
Cullen
1ffdb1756f fix: dup line 2024-04-30 12:11:48 -05:00
Cullen Watson
1185693422 delete empty file 2024-04-30 12:06:20 -05:00
Lluís Salord Quetglas
dcd7144318 FIX: Allow Indeed search term with complex syntax (#139) 2024-04-30 12:05:43 -05:00
Cullen Watson
bf73c061bd enh: linkedin company logo (#141) 2024-04-30 12:03:10 -05:00
Lluís Salord Quetglas
8dd08ed9fd FEAT: Allow LinkedIn scraper to get external job apply url (#140) 2024-04-30 11:36:01 -05:00
Cullen Watson
5d3df732e6 docs: readme 2024-03-12 20:46:25 -05:00
Kellen Mace
86f858e06d Update scrape_jobs() parameters info in readme (#130) 2024-03-12 20:45:13 -05:00
Cullen
1089d1f0a5 docs: readme 2024-03-11 21:30:57 -05:00
Cullen
3e93454738 fix(indeed): readd param 2024-03-11 21:23:20 -05:00
Cullen Watson
0d150d519f docs: readme 2024-03-11 14:52:20 -05:00
Cullen Watson
cc3497f929 docs: readme 2024-03-11 14:45:17 -05:00
Cullen Watson
5986f75346 docs: readme 2024-03-11 14:41:12 -05:00
VitaminB16
4b7bdb9313 feat: Adjust log verbosity via verbose arg (#128) 2024-03-11 14:38:44 -05:00
Cullen Watson
80213f28d2 chore: version 2024-03-11 09:43:12 -05:00
Cullen Watson
ada38532c3 fix: indeed empty location term 2024-03-11 09:42:43 -05:00
10 changed files with 1242 additions and 988 deletions

View File

@@ -38,7 +38,8 @@ jobs = scrape_jobs(
location="Dallas, TX",
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA' # only needed for indeed / glassdoor
country_indeed='USA', # only needed for indeed / glassdoor
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
@@ -48,7 +49,7 @@ jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=Fal
### Output
```
SITE TITLE COMPANY_NAME CITY STATE JOB_TYPE INTERVAL MIN_AMOUNT MAX_AMOUNT JOB_URL DESCRIPTION
SITE TITLE COMPANY CITY STATE JOB_TYPE INTERVAL MIN_AMOUNT MAX_AMOUNT JOB_URL DESCRIPTION
indeed Software Engineer AMERICAN SYSTEMS Arlington VA None yearly 200000 150000 https://www.indeed.com/viewjob?jk=5e409e577046... THIS POSITION COMES WITH A 10K SIGNING BONUS!...
indeed Senior Software Engineer TherapyNotes.com Philadelphia PA fulltime yearly 135000 110000 https://www.indeed.com/viewjob?jk=da39574a40cb... About Us TherapyNotes is the national leader i...
linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale CA fulltime yearly None None https://www.linkedin.com/jobs/view/3693012711 Description:By bringing together people that u...
@@ -60,23 +61,24 @@ zip_recruiter Software Developer TEKsystems Phoenix
### Parameters for `scrape_jobs()`
```plaintext
Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── site_name (list|str): linkedin, zip_recruiter, indeed, glassdoor (default is all four)
├── search_term (str)
├── location (str)
├── distance (int): in miles, default 50
├── job_type (enum): fulltime, parttime, internship, contract
├── job_type (str): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port'
├── is_remote (bool)
├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site (not supported on Indeed)
├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids
├── description_format (enum): markdown, html (format type of the job descriptions)
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (ZipRecruiter and Glassdoor round up to next day. If you use this on Indeed, it will not filter by job_type or is_remote)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_name'
├── easy_apply (bool): filters for jobs that are hosted on the job board site (LinkedIn & Indeed do not allow pairing this with hours_old)
├── linkedin_fetch_description (bool): fetches full description and direct job url for LinkedIn (slower)
├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids
├── description_format (str): markdown, html (Format type of the job descriptions. Default is markdown.)
├── country_indeed (str): filters the country on Indeed (see below for correct spelling)
├── offset (int): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (ZipRecruiter and Glassdoor round up to next day. If you use this on Indeed, it will not filter by job_type/is_remote/easy_apply)
├── verbose (int) {0, 1, 2}: Controls the verbosity of the runtime printouts (0 prints only errors, 1 is errors+warnings, 2 is all logs. Default is 2.)
├── hyperlinks (bool): Whether to turn `job_url`s into hyperlinks. Default is false.
```
### JobPost Schema
@@ -119,7 +121,7 @@ Indeed specific
### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we are using
LinkedIn searches globally & uses only the `location` parameter.
### **ZipRecruiter**
@@ -154,7 +156,7 @@ You can specify the following countries when searching on Indeed (use the exact
## Notes
* Indeed is the best scraper currently with no rate limiting.
* Glassdoor/Ziprecruiter can only fetch 900/1000 jobs from the endpoints we are using on a given search.
* All the job board endpoints are capped at around 1000 jobs on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page.
## Frequently Asked Questions

View File

@@ -32,17 +32,18 @@ while len(all_jobs) < results_wanted:
search_term="software engineer",
# New York, NY
# Dallas, TX
# Los Angeles, CA
location="Los Angeles, CA",
results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
results_wanted=min(
results_in_each_iteration, results_wanted - len(all_jobs)
),
country_indeed="USA",
offset=offset,
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)
# Add the scraped jobs to the list
all_jobs.extend(jobs.to_dict('records'))
all_jobs.extend(jobs.to_dict("records"))
# Increment the offset for the next page of results
offset += results_in_each_iteration

2068
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.49"
version = "1.1.52"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
@@ -19,13 +19,14 @@ NUMPY = "1.24.2"
pydantic = "^2.3.0"
tls-client = "^1.0.1"
markdownify = "^0.11.6"
regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "^24.2.0"
pre-commit = "^3.6.2"
black = "*"
pre-commit = "*"
[build-system]
requires = ["poetry-core"]

View File

@@ -5,7 +5,7 @@ from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from .jobs import JobType, Location
from .scrapers.utils import logger
from .scrapers.utils import logger, set_logger_level
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
@@ -36,11 +36,12 @@ def scrape_jobs(
linkedin_company_ids: list[int] | None = None,
offset: int | None = 0,
hours_old: int = None,
verbose: int = 2,
**kwargs,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data
:return: pandas dataframe containing job data
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
@@ -48,6 +49,7 @@ def scrape_jobs(
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
set_logger_level(verbose)
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from ..jobs import (
Enum,
BaseModel,
@@ -36,9 +38,10 @@ class ScraperInput(BaseModel):
hours_old: int | None = None
class Scraper:
class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@@ -90,18 +90,18 @@ class IndeedScraper(Scraper):
jobs = []
new_cursor = None
filters = self._build_filters()
location = (
self.scraper_input.location
or self.scraper_input.country.value[0].split(",")[-1]
)
search_term = self.scraper_input.search_term.replace('"', '\\"') if self.scraper_input.search_term else ""
query = self.job_search_query.format(
what=(
f'what: "{self.scraper_input.search_term}"'
if self.scraper_input.search_term
f'what: "{search_term}"'
if search_term
else ""
),
location=(
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
if self.scraper_input.location
else ""
),
location=location,
radius=self.scraper_input.distance,
dateOnIndeed=self.scraper_input.hours_old,
cursor=f'cursor: "{cursor}"' if cursor else "",
filters=filters,
@@ -120,7 +120,7 @@ class IndeedScraper(Scraper):
)
if response.status_code != 200:
logger.info(
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a beg)"
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
)
return jobs, new_cursor
data = response.json()
@@ -151,6 +151,15 @@ class IndeedScraper(Scraper):
""".format(
start=self.scraper_input.hours_old
)
elif self.scraper_input.easy_apply:
filters_str = """
filters: {
keyword: {
field: "indeedApplyScope",
keys: ["DESKTOP"]
}
}
"""
elif self.scraper_input.job_type or self.scraper_input.is_remote:
job_type_key_mapping = {
JobType.FULL_TIME: "CF3CP",
@@ -343,7 +352,7 @@ class IndeedScraper(Scraper):
query GetJobData {{
jobSearch(
{what}
location: {{ where: "{location}", radius: {radius}, radiusUnit: MILES }}
{location}
includeSponsoredResults: NONE
limit: 100
sort: DATE

View File

@@ -9,6 +9,8 @@ from __future__ import annotations
import time
import random
import regex as re
import urllib.parse
from typing import Optional
from datetime import datetime
@@ -51,6 +53,7 @@ class LinkedInScraper(Scraper):
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@@ -194,16 +197,16 @@ class LinkedInScraper(Scraper):
if metadata_card
else None
)
date_posted = description = job_type = None
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except:
date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text")
job_details = {}
if full_descr:
description, job_type = self._get_job_description(job_url)
job_details = self._get_job_details(job_url)
return JobPost(
title=title,
@@ -213,18 +216,18 @@ class LinkedInScraper(Scraper):
date_posted=date_posted,
job_url=job_url,
compensation=compensation,
job_type=job_type,
description=description,
emails=extract_emails_from_text(description) if description else None,
job_type=job_details.get("job_type"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
)
def _get_job_description(
self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
def _get_job_details(self, job_page_url: str) -> dict:
"""
Retrieves job description by going to the job page url
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: description or None
:return: dict
"""
try:
session = create_session(is_tls=False, has_retry=True)
@@ -233,9 +236,9 @@ class LinkedInScraper(Scraper):
)
response.raise_for_status()
except:
return None, None
return {}
if response.url == "https://www.linkedin.com/signup":
return None, None
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
@@ -253,7 +256,14 @@ class LinkedInScraper(Scraper):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup)
return {
"description": description,
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
"data-delayed-url"
),
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
@@ -306,6 +316,23 @@ class LinkedInScraper(Scraper):
return [get_enum_from_job_type(employment_type)] if employment_type else []
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
    """
    Gets the job url direct from job page
    :param soup: parsed job page HTML
    :return: the decoded external apply url, or None when the page has none
    """
    # LinkedIn embeds the external apply link inside a <code id="applyUrl"> tag.
    code_tag = soup.find("code", id="applyUrl")
    if not code_tag:
        return None
    # The url is percent-encoded inside a `?url=` query parameter; the
    # precompiled regex captures everything after it up to the closing quote.
    match = self.job_url_direct_regex.search(code_tag.decode_contents().strip())
    return urllib.parse.unquote(match.group()) if match else None
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {

View File

@@ -21,6 +21,23 @@ if not logger.handlers:
logger.addHandler(console_handler)
def set_logger_level(verbose: int = 2):
    """
    Adjusts the logger's level. This function allows the logging level to be changed at runtime.
    Parameters:
    - verbose: int {0, 1, 2} (default=2, all logs); any other value falls back to INFO.
      None is a no-op (leaves the current level untouched).
    """
    if verbose is None:
        return
    # Map the verbosity flag straight to logging level constants. The previous
    # implementation round-tripped through level *names* and getattr(), with a
    # ValueError branch that was unreachable (dict.get's default made every
    # lookup succeed) — mapping to the constants directly removes the dead code
    # while preserving the out-of-range fallback to INFO.
    level = {0: logging.ERROR, 1: logging.WARNING, 2: logging.INFO}.get(
        verbose, logging.INFO
    )
    logger.setLevel(level)
def markdown_converter(description_html: str):
if description_html is None:
return None