Description format (#107)

pull/117/head v1.1.45
Cullen Watson 2024-02-14 16:04:23 -06:00 committed by GitHub
parent aeb1a50d2c
commit ba3a16b228
11 changed files with 592 additions and 592 deletions

View File

@@ -11,7 +11,7 @@ work with us.*
 - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
 - Aggregates the job postings in a Pandas DataFrame
-- Proxy support (HTTP/S, SOCKS)
+- Proxy support
 [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
 Updated for release v1.1.3
@@ -67,12 +67,13 @@ Optional
 ├── location (str)
 ├── distance (int): in miles
 ├── job_type (enum): fulltime, parttime, internship, contract
-├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
+├── proxy (str): in format 'http://user:pass@host:port'
 ├── is_remote (bool)
-├── full_description (bool): fetches full description for LinkedIn (slower)
+├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
 ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
 ├── easy_apply (bool): filters for jobs that are hosted on the job board site
 ├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids
+├── description_format (enum): markdown, html (format type of the job descriptions)
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
 ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
 ├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)
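To make the renamed and added parameters above concrete, here is a minimal usage sketch (search values are illustrative; parameter names follow the tree above):

```python
from jobspy import scrape_jobs

# Minimal sketch: exercises the new description_format and the renamed
# linkedin_fetch_description parameters. Values are illustrative.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=20,
    description_format="markdown",      # new: "markdown" (default) or "html"
    linkedin_fetch_description=True,    # renamed from full_description; slower
    country_indeed="usa",
    # proxy="http://user:pass@host:port",  # optional
)
print(jobs.head())
```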

poetry.lock (generated)
View File

@@ -524,6 +524,17 @@ files = [
     {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
 ]
+[[package]]
+name = "html2text"
+version = "2020.1.16"
+description = "Turn HTML into equivalent Markdown-structured text."
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
+    {file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
+]
 [[package]]
 name = "idna"
 version = "3.4"
@@ -2435,4 +2446,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "404a77d78066cbb2ef71015562baf44aa11d12aac29a191c1ccc7758bfda598a"
+content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0"

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.44"
+version = "1.1.45"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
@@ -18,6 +18,7 @@ beautifulsoup4 = "^4.12.2"
 pandas = "^2.1.0"
 NUMPY = "1.24.2"
 pydantic = "^2.3.0"
+html2text = "^2020.1.16"
 [tool.poetry.group.dev.dependencies]

View File

@ -15,17 +15,6 @@ from .scrapers.exceptions import (
GlassdoorException, GlassdoorException,
) )
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def scrape_jobs( def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None, site_name: str | list[str] | Site | list[Site] | None = None,
@ -39,7 +28,8 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: str | None = None, proxy: str | None = None,
full_description: bool | None = False, description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None, linkedin_company_ids: list[int] | None = None,
offset: int | None = 0, offset: int | None = 0,
hours_old: int = None, hours_old: int = None,
@ -49,6 +39,15 @@ def scrape_jobs(
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data :return: results_wanted: pandas dataframe containing job data
""" """
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def get_enum_from_value(value_str): def get_enum_from_value(value_str):
for job_type in JobType: for job_type in JobType:
@ -61,16 +60,15 @@ def scrape_jobs(
def get_site_type(): def get_site_type():
site_types = list(Site) site_types = list(Site)
if isinstance(site_name, str): if isinstance(site_name, str):
site_types = [_map_str_to_site(site_name)] site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site): elif isinstance(site_name, Site):
site_types = [site_name] site_types = [site_name]
elif isinstance(site_name, list): elif isinstance(site_name, list):
site_types = [ site_types = [
_map_str_to_site(site) if isinstance(site, str) else site map_str_to_site(site) if isinstance(site, str) else site
for site in site_name for site in site_name
] ]
return site_types return site_types
country_enum = Country.from_string(country_indeed) country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@ -82,7 +80,8 @@ def scrape_jobs(
is_remote=is_remote, is_remote=is_remote,
job_type=job_type, job_type=job_type,
easy_apply=easy_apply, easy_apply=easy_apply,
full_description=full_description, description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted, results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids, linkedin_company_ids=linkedin_company_ids,
offset=offset, offset=offset,
@ -92,22 +91,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy) scraper = scraper_class(proxy=proxy)
try:
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
if site == Site.LINKEDIN:
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException(str(e))
if site == Site.GLASSDOOR:
raise GlassdoorException(str(e))
else:
raise e
return site.value, scraped_data return site.value, scraped_data
site_to_jobs_dict = {} site_to_jobs_dict = {}
@ -188,8 +172,6 @@ def scrape_jobs(
"emails", "emails",
"description", "description",
] ]
jobs_formatted_df = jobs_df[desired_order] return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
else: else:
jobs_formatted_df = pd.DataFrame() return pd.DataFrame()
return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])

View File

@ -210,6 +210,11 @@ class Compensation(BaseModel):
currency: Optional[str] = "USD" currency: Optional[str] = "USD"
class DescriptionFormat(Enum):
MARKDOWN = "markdown"
HTML = "html"
class JobPost(BaseModel): class JobPost(BaseModel):
title: str title: str
company_name: str company_name: str

View File

@ -1,4 +1,11 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat
)
class Site(Enum): class Site(Enum):
@ -18,9 +25,10 @@ class ScraperInput(BaseModel):
is_remote: bool = False is_remote: bool = False
job_type: JobType | None = None job_type: JobType | None = None
easy_apply: bool | None = None easy_apply: bool | None = None
full_description: bool = False
offset: int = 0 offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
results_wanted: int = 15 results_wanted: int = 15
hours_old: int | None = None hours_old: int | None = None
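A hedged sketch of filling in the updated ScraperInput directly (normally scrape_jobs builds this object; Site, Country, and the remaining fields come from the modules shown elsewhere in this diff, and the values are illustrative):

```python
scraper_input = ScraperInput(
    site_type=[Site.LINKEDIN, Site.INDEED],
    search_term="data engineer",
    location="Austin, TX",
    country=Country.from_string("usa"),
    description_format=DescriptionFormat.HTML,  # defaults to DescriptionFormat.MARKDOWN
    linkedin_fetch_description=True,            # replaces the old full_description flag
    results_wanted=10,
)
```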

View File

@ -13,7 +13,11 @@ from ..utils import count_urgent_words, extract_emails_from_text
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import create_session from ..utils import (
create_session,
markdown_converter,
logger
)
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -21,6 +25,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat
) )
@ -32,13 +37,57 @@ class GlassdoorScraper(Scraper):
site = Site(Site.GLASSDOOR) site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
self.url = None self.base_url = None
self.country = None self.country = None
self.session = None self.session = None
self.scraper_input = None
self.jobs_per_page = 30 self.jobs_per_page = 30
self.seen_urls = set() self.seen_urls = set()
def fetch_jobs_page( def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Glassdoor for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_url()
location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
return JobResponse(jobs=[])
all_jobs: list[JobPost] = []
cursor = None
max_pages = 30
self.session = create_session(self.proxy, is_tls=False, has_retry=True)
self.session.get(self.base_url)
try:
for page in range(
1 + (scraper_input.offset // self.jobs_per_page),
min(
(scraper_input.results_wanted // self.jobs_per_page) + 2,
max_pages + 1,
),
):
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
raise GlassdoorException(str(e))
except Exception as e:
raise GlassdoorException(str(e))
return JobResponse(jobs=all_jobs)
def _fetch_jobs_page(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
location_id: int, location_id: int,
@ -49,12 +98,13 @@ class GlassdoorScraper(Scraper):
""" """
Scrapes a page of Glassdoor for jobs with scraper_input criteria Scrapes a page of Glassdoor for jobs with scraper_input criteria
""" """
self.scraper_input = scraper_input
try: try:
payload = self.add_payload( payload = self._add_payload(
scraper_input, location_id, location_type, page_num, cursor location_id, location_type, page_num, cursor
) )
response = self.session.post( response = self.session.post(
f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload
) )
if response.status_code != 200: if response.status_code != 200:
raise GlassdoorException( raise GlassdoorException(
@ -70,7 +120,7 @@ class GlassdoorScraper(Scraper):
jobs = [] jobs = []
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data} future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
for future in as_completed(future_to_job_data): for future in as_completed(future_to_job_data):
try: try:
job_post = future.result() job_post = future.result()
@ -83,10 +133,12 @@ class GlassdoorScraper(Scraper):
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
) )
def process_job(self, job_data): def _process_job(self, job_data):
"""Processes a single job and fetches its description.""" """
Processes a single job and fetches its description.
"""
job_id = job_data["jobview"]["job"]["listingId"] job_id = job_data["jobview"]["job"]["listingId"]
job_url = f'{self.url}job-listing/j?jl={job_id}' job_url = f'{self.base_url}job-listing/j?jl={job_id}'
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
@ -106,15 +158,13 @@ class GlassdoorScraper(Scraper):
location = self.parse_location(location_name) location = self.parse_location(location_name)
compensation = self.parse_compensation(job["header"]) compensation = self.parse_compensation(job["header"])
try: try:
description = self.fetch_job_description(job_id) description = self._fetch_job_description(job_id)
except: except:
description = None description = None
return JobPost(
job_post = JobPost(
title=title, title=title,
company_url=f"{self.url}Overview/W-EI_IE{company_id}.htm" if company_id else None, company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
company_name=company_name, company_name=company_name,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
@ -125,53 +175,12 @@ class GlassdoorScraper(Scraper):
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
return job_post
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def _fetch_job_description(self, job_id):
""" """
Scrapes Glassdoor for jobs with scraper_input criteria. Fetches the job description for a single job ID.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
""" """
scraper_input.results_wanted = min(900, scraper_input.results_wanted) url = f"{self.base_url}/graph"
self.country = scraper_input.country
self.url = self.country.get_url()
location_id, location_type = self.get_location(
scraper_input.location, scraper_input.is_remote
)
all_jobs: list[JobPost] = []
cursor = None
max_pages = 30
self.session = create_session(self.proxy, is_tls=False, has_retry=True)
self.session.get(self.url)
try:
for page in range(
1 + (scraper_input.offset // self.jobs_per_page),
min(
(scraper_input.results_wanted // self.jobs_per_page) + 2,
max_pages + 1,
),
):
try:
jobs, cursor = self.fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
raise GlassdoorException(str(e))
except Exception as e:
raise GlassdoorException(str(e))
return JobResponse(jobs=all_jobs)
def fetch_job_description(self, job_id):
"""Fetches the job description for a single job ID."""
url = f"{self.url}/graph"
body = [ body = [
{ {
"operationName": "JobDetailQuery", "operationName": "JobDetailQuery",
@ -196,48 +205,28 @@ class GlassdoorScraper(Scraper):
""" """
} }
] ]
response = requests.post(url, json=body, headers=GlassdoorScraper.headers()) res = requests.post(url, json=body, headers=self.headers)
if response.status_code != 200: if res.status_code != 200:
return None return None
data = response.json()[0] data = res.json()[0]
desc = data['data']['jobview']['job']['description'] desc = data['data']['jobview']['job']['description']
return desc return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
@staticmethod def _get_location(self, location: str, is_remote: bool) -> (int, str):
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
def get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote: if not location or is_remote:
return "11047", "STATE" # remote options return "11047", "STATE" # remote options
url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True) session = create_session(self.proxy, has_retry=True)
response = session.get(url) res = session.get(url)
if response.status_code != 200: if res.status_code != 200:
raise GlassdoorException( if res.status_code == 429:
f"bad response status code: {response.status_code}" logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
) return None, None
items = response.json() else:
logger.error(f'Glassdoor response status code {res.status_code}')
return None, None
items = res.json()
if not items: if not items:
raise ValueError(f"Location '{location}' not found on Glassdoor") raise ValueError(f"Location '{location}' not found on Glassdoor")
location_type = items[0]["locationType"] location_type = items[0]["locationType"]
@ -249,18 +238,16 @@ class GlassdoorScraper(Scraper):
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
@staticmethod def _add_payload(
def add_payload( self,
scraper_input,
location_id: int, location_id: int,
location_type: str, location_type: str,
page_num: int, page_num: int,
cursor: str | None = None, cursor: str | None = None,
) -> str: ) -> str:
# `fromage` is the posting time filter in days fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
filter_params = [] filter_params = []
if scraper_input.easy_apply: if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"}) filter_params.append({"filterKey": "applicationType", "values": "1"})
if fromage: if fromage:
filter_params.append({"filterKey": "fromAge", "values": str(fromage)}) filter_params.append({"filterKey": "fromAge", "values": str(fromage)})
@ -269,7 +256,7 @@ class GlassdoorScraper(Scraper):
"variables": { "variables": {
"excludeJobListingIds": [], "excludeJobListingIds": [],
"filterParams": filter_params, "filterParams": filter_params,
"keyword": scraper_input.search_term, "keyword": self.scraper_input.search_term,
"numJobsToShow": 30, "numJobsToShow": 30,
"locationType": location_type, "locationType": location_type,
"locationId": int(location_id), "locationId": int(location_id),
@ -446,13 +433,34 @@ class GlassdoorScraper(Scraper):
} }
""" """
} }
if self.scraper_input.job_type:
if scraper_input.job_type:
payload["variables"]["filterParams"].append( payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": scraper_input.job_type.value[0]} {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
) )
return json.dumps([payload]) return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None: def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
@ -472,13 +480,7 @@ class GlassdoorScraper(Scraper):
if cursor_data["pageNumber"] == page_num: if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"] return cursor_data["cursor"]
@staticmethod headers = {
def headers() -> dict:
"""
Returns headers needed for requests
:return: dict - Dictionary containing headers
"""
return {
"authority": "www.glassdoor.com", "authority": "www.glassdoor.com",
"accept": "*/*", "accept": "*/*",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",

View File

@ -21,6 +21,7 @@ from ..utils import (
extract_emails_from_text, extract_emails_from_text,
create_session, create_session,
get_enum_from_job_type, get_enum_from_job_type,
markdown_converter,
logger logger
) )
from ...jobs import ( from ...jobs import (
@ -30,6 +31,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat
) )
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
@ -39,121 +41,23 @@ class IndeedScraper(Scraper):
""" """
Initializes IndeedScraper with the Indeed job search url Initializes IndeedScraper with the Indeed job search url
""" """
self.url = None self.scraper_input = None
self.country = None self.jobs_per_page = 25
self.num_workers = 10
self.seen_urls = set()
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
site = Site(Site.INDEED) site = Site(Site.INDEED)
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
self.jobs_per_page = 25
self.seen_urls = set()
def scrape_page(
self, scraper_input: ScraperInput, page: int
) -> list[JobPost]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param scraper_input:
:param page:
:return: jobs found on page, total number of jobs found for search
"""
job_list = []
self.country = scraper_input.country
domain = self.country.indeed_domain_value
self.url = f"https://{domain}.indeed.com"
try:
session = create_session(self.proxy)
response = session.get(
f"{self.url}/m/jobs",
headers=self.get_headers(),
params=self.add_params(scraper_input, page),
allow_redirects=True,
timeout_seconds=10,
)
if response.status_code not in range(200, 400):
raise IndeedException(
f"bad response with status code: {response.status_code}"
)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return job_list
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in response.text:
return job_list
jobs = IndeedScraper.parse_jobs(
soup
) #: can raise exception, handled by main scrape function
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
def process_job(job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
job_post = JobPost(
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=f"{self.url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed['employer'] else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
country=self.country,
),
job_type=job_type,
compensation=self.get_compensation(job, job_detailed),
date_posted=date_posted,
job_url=job_url_client,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description)
if description
else None,
is_remote=IndeedScraper.is_job_remote(job, job_detailed, description)
)
return job_post
workers = 10
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self.get_job_details(job_keys)
with ThreadPoolExecutor(max_workers=workers) as executor:
job_results: list[Future] = [
executor.submit(process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
]
job_list = [result.result() for result in job_results if result.result()]
return job_list
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
Scrapes Indeed for jobs with scraper_input criteria Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
job_list = self.scrape_page(scraper_input, 0) self.scraper_input = scraper_input
job_list = self._scrape_page()
pages_processed = 1 pages_processed = 1
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
@ -162,7 +66,7 @@ class IndeedScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page + pages_processed) executor.submit(self._scrape_page, page + pages_processed)
for page in range(pages_to_process) for page in range(pages_to_process)
] ]
@ -184,8 +88,136 @@ class IndeedScraper(Scraper):
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def _scrape_page(self, page: int=0) -> list[JobPost]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param page:
:return: jobs found on page, total number of jobs found for search
"""
job_list = []
domain = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com"
try:
session = create_session(self.proxy)
response = session.get(
f"{self.base_url}/m/jobs",
headers=self.headers,
params=self._add_params(page),
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
logger.error(f'429 Response - Blocked by Indeed for too many requests')
else:
logger.error(f'Indeed response status code {response.status_code}')
return job_list
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return job_list
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in response.text:
return job_list
jobs = IndeedScraper._parse_jobs(soup)
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self._get_job_details(job_keys)
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [
executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
]
job_list = [result.result() for result in job_results if result.result()]
return job_list
def _process_job(self, job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.base_url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.base_url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
job_type = self._get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
return JobPost(
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=f"{self.base_url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed[
'employer'] else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
country=self.scraper_input.country,
),
job_type=job_type,
compensation=self._get_compensation(job, job_detailed),
date_posted=date_posted,
job_url=job_url_client,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
is_remote=self._is_job_remote(job, job_detailed, description)
)
def _get_job_details(self, job_keys: list[str]) -> dict:
"""
Queries the GraphQL endpoint for detailed job information for the given job keys.
"""
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']'
payload = dict(self.api_payload)
payload["query"] = self.api_payload["query"].format(job_keys_gql=job_keys_gql)
response = requests.post(self.api_url, headers=self.api_headers, json=payload, proxies=self.proxy)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
def _add_params(self, page: int) -> dict[str, str | Any]:
fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
params = {
"q": self.scraper_input.search_term,
"l": self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
"filter": 0,
"start": self.scraper_input.offset + page * 10,
"sort": "date",
"fromage": fromage,
}
if self.scraper_input.distance:
params["radius"] = self.scraper_input.distance
sc_values = []
if self.scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if self.scraper_input.job_type:
sc_values.append("jt({})".format(self.scraper_input.job_type.value[0]))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
if self.scraper_input.easy_apply:
params['iafilter'] = 1
return params
@staticmethod @staticmethod
def get_job_type(job: dict) -> list[JobType] | None: def _get_job_type(job: dict) -> list[JobType] | None:
""" """
Parses the job to get list of job types Parses the job to get list of job types
:param job: :param job:
@ -204,7 +236,7 @@ class IndeedScraper(Scraper):
return job_types return job_types
@staticmethod @staticmethod
def get_compensation(job: dict, job_detailed: dict) -> Compensation: def _get_compensation(job: dict, job_detailed: dict) -> Compensation:
""" """
Parses the job to get Parses the job to get
:param job: :param job:
@ -213,7 +245,7 @@ class IndeedScraper(Scraper):
""" """
comp = job_detailed['compensation']['baseSalary'] comp = job_detailed['compensation']['baseSalary']
if comp: if comp:
interval = IndeedScraper.get_correct_interval(comp['unitOfWork']) interval = IndeedScraper._get_correct_interval(comp['unitOfWork'])
if interval: if interval:
return Compensation( return Compensation(
interval=interval, interval=interval,
@ -242,18 +274,13 @@ class IndeedScraper(Scraper):
return compensation return compensation
@staticmethod @staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict: def _parse_jobs(soup: BeautifulSoup) -> dict:
""" """
Parses the jobs from the soup object Parses the jobs from the soup object
:param soup: :param soup:
:return: jobs :return: jobs
""" """
def find_mosaic_script() -> Tag | None: def find_mosaic_script() -> Tag | None:
"""
Finds jobcards script tag
:return: script_tag
"""
script_tags = soup.find_all("script") script_tags = soup.find_all("script")
for tag in script_tags: for tag in script_tags:
@ -266,7 +293,6 @@ class IndeedScraper(Scraper):
return None return None
script_tag = find_mosaic_script() script_tag = find_mosaic_script()
if script_tag: if script_tag:
script_str = script_tag.string script_str = script_tag.string
pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});' pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
@ -283,49 +309,7 @@ class IndeedScraper(Scraper):
) )
@staticmethod @staticmethod
def get_headers(): def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
return {
'Host': 'www.indeed.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'sec-fetch-site': 'same-origin',
'sec-fetch-dest': 'document',
'accept-language': 'en-US,en;q=0.9',
'sec-fetch-mode': 'navigate',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
}
@staticmethod
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
# `fromage` is the posting time filter in days
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params = {
"q": scraper_input.search_term,
"l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date",
"fromage": fromage,
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
sc_values = []
if scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if scraper_input.job_type:
sc_values.append("jt({})".format(scraper_input.job_type.value[0]))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
if scraper_input.easy_apply:
params['iafilter'] = 1
return params
@staticmethod
def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
remote_keywords = ['remote', 'work from home', 'wfh'] remote_keywords = ['remote', 'work from home', 'wfh']
is_remote_in_attributes = any( is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords) any(keyword in attr['label'].lower() for keyword in remote_keywords)
@ -342,12 +326,32 @@ class IndeedScraper(Scraper):
) )
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
def get_job_details(self, job_keys: list[str]) -> dict: @staticmethod
""" def _get_correct_interval(interval: str) -> CompensationInterval:
Queries the GraphQL endpoint for detailed job information for the given job keys. interval_mapping = {
""" "DAY": "DAILY",
url = "https://apis.indeed.com/graphql" "YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY"
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")
headers = { headers = {
'Host': 'www.indeed.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'sec-fetch-site': 'same-origin',
'sec-fetch-dest': 'document',
'accept-language': 'en-US,en;q=0.9',
'sec-fetch-mode': 'navigate',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
}
api_headers = {
'Host': 'apis.indeed.com', 'Host': 'apis.indeed.com',
'content-type': 'application/json', 'content-type': 'application/json',
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
@ -358,11 +362,8 @@ class IndeedScraper(Scraper):
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
'indeed-co': 'US', 'indeed-co': 'US',
} }
api_payload = {
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']' "query": """
payload = {
"query": f"""
query GetJobData {{ query GetJobData {{
jobData(input: {{ jobData(input: {{
jobKeys: {job_keys_gql} jobKeys: {job_keys_gql}
@ -414,23 +415,3 @@ class IndeedScraper(Scraper):
}} }}
""" """
} }
response = requests.post(url, headers=headers, json=payload, proxies=self.proxy)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
@staticmethod
def get_correct_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"DAY": "DAILY",
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY"
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")

View File

@ -25,26 +25,30 @@ from ...jobs import (
JobResponse, JobResponse,
JobType, JobType,
Country, Country,
Compensation Compensation,
DescriptionFormat
) )
from ..utils import ( from ..utils import (
logger,
count_urgent_words, count_urgent_words,
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
currency_parser currency_parser,
markdown_converter
) )
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
DELAY = 3 base_url = "https://www.linkedin.com"
delay = 3
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
self.scraper_input = None
site = Site(Site.LINKEDIN) site = Site(Site.LINKEDIN)
self.country = "worldwide" self.country = "worldwide"
self.url = "https://www.linkedin.com"
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@ -53,28 +57,16 @@ class LinkedInScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
self.scraper_input = scraper_input
job_list: list[JobPost] = [] job_list: list[JobPost] = []
seen_urls = set() seen_urls = set()
url_lock = Lock() url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
seconds_old = ( seconds_old = (
scraper_input.hours_old * 3600 scraper_input.hours_old * 3600
if scraper_input.hours_old if scraper_input.hours_old
else None else None
) )
def job_type_code(job_type_enum):
mapping = {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}
return mapping.get(job_type_enum, "")
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000 continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search(): while continue_search():
@ -84,7 +76,7 @@ class LinkedInScraper(Scraper):
"location": scraper_input.location, "location": scraper_input.location,
"distance": scraper_input.distance, "distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None, "f_WT": 2 if scraper_input.is_remote else None,
"f_JT": job_type_code(scraper_input.job_type) "f_JT": self.job_type_code(scraper_input.job_type)
if scraper_input.job_type if scraper_input.job_type
else None, else None,
"pageNum": 0, "pageNum": 0,
@ -97,23 +89,25 @@ class LinkedInScraper(Scraper):
params = {k: v for k, v in params.items() if v is not None} params = {k: v for k, v in params.items() if v is not None}
try: try:
response = session.get( response = session.get(
f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?", f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params, params=params,
allow_redirects=True, allow_redirects=True,
proxies=self.proxy, proxies=self.proxy,
headers=self.headers(), headers=self.headers,
timeout=10, timeout=10,
) )
response.raise_for_status() if response.status_code not in range(200, 400):
if response.status_code == 429:
except requests.HTTPError as e: logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
raise LinkedInException( else:
f"bad response status code: {e.response.status_code}" logger.error(f'LinkedIn response status code {response.status_code}')
) return JobResponse(job_list=job_list)
except ProxyError as e:
raise LinkedInException("bad proxy")
except Exception as e: except Exception as e:
raise LinkedInException(str(e)) if "Proxy responded with" in str(e):
logger.error(f'LinkedIn: Bad proxy')
else:
logger.error(f'LinkedIn: {str(e)}')
return JobResponse(job_list=job_list)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card") job_cards = soup.find_all("div", class_="base-search-card")
@ -126,29 +120,29 @@ class LinkedInScraper(Scraper):
if href_tag and "href" in href_tag.attrs: if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0] href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1] job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}" job_url = f"{self.base_url}/jobs/view/{job_id}"
with url_lock: with url_lock:
if job_url in seen_urls: if job_url in seen_urls:
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)
# Call process_job directly without threading
try: try:
job_post = self.process_job(job_card, job_url, scraper_input.full_description) job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
if job_post: if job_post:
job_list.append(job_post) job_list.append(job_post)
if not continue_search():
break
except Exception as e: except Exception as e:
raise LinkedInException("Exception occurred while processing jobs") raise LinkedInException(str(e))
if continue_search(): if continue_search():
time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2)) time.sleep(random.uniform(self.delay, self.delay + 2))
page += 25 page += 25
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
salary_tag = job_card.find('span', class_='job-search-card__salary-info') salary_tag = job_card.find('span', class_='job-search-card__salary-info')
compensation = None compensation = None
@ -178,7 +172,7 @@ class LinkedInScraper(Scraper):
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A" company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata") metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self.get_location(metadata_card) location = self._get_location(metadata_card)
datetime_tag = ( datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate") metadata_card.find("time", class_="job-search-card__listdate")
@ -190,12 +184,12 @@ class LinkedInScraper(Scraper):
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
try: try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except Exception as e: except:
date_posted = None date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text") benefits_tag = job_card.find("span", class_="result-benefits__text")
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
if full_descr: if full_descr:
description, job_type = self.get_job_description(job_url) description, job_type = self._get_job_description(job_url)
return JobPost( return JobPost(
title=title, title=title,
@ -212,7 +206,7 @@ class LinkedInScraper(Scraper):
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_job_description( def _get_job_description(
self, job_page_url: str self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]: ) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
""" """
@ -222,11 +216,9 @@ class LinkedInScraper(Scraper):
""" """
try: try:
session = create_session(is_tls=False, has_retry=True) session = create_session(is_tls=False, has_retry=True)
response = session.get(job_page_url, timeout=5, proxies=self.proxy) response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
response.raise_for_status() response.raise_for_status()
except requests.HTTPError as e: except:
return None, None
except Exception as e:
return None, None return None, None
if response.url == "https://www.linkedin.com/signup": if response.url == "https://www.linkedin.com/signup":
return None, None return None, None
@ -241,40 +233,13 @@ class LinkedInScraper(Scraper):
for attr in list(tag.attrs): for attr in list(tag.attrs):
del tag[attr] del tag[attr]
return tag return tag
div_content = remove_attributes(div_content) div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html") description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup)
def get_job_type( def _get_location(self, metadata_card: Optional[Tag]) -> Location:
soup_job_type: BeautifulSoup,
) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
return description, get_job_type(soup)
def get_location(self, metadata_card: Optional[Tag]) -> Location:
""" """
Extracts the location data from the job metadata card. Extracts the location data from the job metadata card.
:param metadata_card :param metadata_card
@ -299,25 +264,50 @@ class LinkedInScraper(Scraper):
location = Location( location = Location(
city=city, city=city,
state=state, state=state,
country=Country.from_string(country), country=Country.from_string(country)
) )
return location return location
@staticmethod @staticmethod
def headers() -> dict: def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return { return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
headers = {
"authority": "www.linkedin.com", "authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"macOS"',
# 'sec-fetch-dest': 'document',
# 'sec-fetch-mode': 'navigate',
# 'sec-fetch-site': 'none',
# 'sec-fetch-user': '?1',
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }

View File

@ -2,13 +2,16 @@ import re
import logging import logging
import numpy as np import numpy as np
import html2text
import tls_client import tls_client
import requests import requests
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType from ..jobs import JobType
text_maker = html2text.HTML2Text()
logger = logging.getLogger("JobSpy") logger = logging.getLogger("JobSpy")
logger.propagate = False
if not logger.handlers: if not logger.handlers:
logger.setLevel(logging.ERROR) logger.setLevel(logging.ERROR)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
@ -32,6 +35,17 @@ def count_urgent_words(description: str) -> int:
return count return count
def markdown_converter(description_html: str):
if description_html is None:
return ""
text_maker.ignore_links = False
try:
markdown = text_maker.handle(description_html)
return markdown.strip()
except AssertionError as e:
return ""
def extract_emails_from_text(text: str) -> list[str] | None: def extract_emails_from_text(text: str) -> list[str] | None:
if not text: if not text:
return None return None
@ -42,14 +56,10 @@ def extract_emails_from_text(text: str) -> list[str] | None:
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session: def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
""" """
Creates a requests session with optional tls, proxy, and retry settings. Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object :return: A session object
""" """
if is_tls: if is_tls:
session = tls_client.Session( session = tls_client.Session(random_tls_extension_order=True)
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy session.proxies = proxy
else: else:
session = requests.Session() session = requests.Session()
@ -66,7 +76,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
session.mount('http://', adapter) session.mount('http://', adapter)
session.mount('https://', adapter) session.mount('https://', adapter)
return session return session
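For a sense of what the new markdown_converter helper returns, a small sketch using html2text directly, as the helper above does (the sample HTML is made up; real input is the job boards' description HTML):

```python
import html2text

text_maker = html2text.HTML2Text()
text_maker.ignore_links = False

sample_html = "<h2>Responsibilities</h2><ul><li>Build <b>scrapers</b></li></ul>"
print(text_maker.handle(sample_html).strip())
# Output (roughly):
# ## Responsibilities
#
#   * Build **scrapers**
```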

View File

@ -6,33 +6,76 @@ This module contains routines to scrape ZipRecruiter.
""" """
import math import math
import time import time
from datetime import datetime, timezone from datetime import datetime
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..utils import (
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country logger,
from ..utils import count_urgent_words, extract_emails_from_text, create_session count_urgent_words,
extract_emails_from_text,
create_session,
markdown_converter
)
from ...jobs import (
JobPost,
Compensation,
Location,
JobResponse,
JobType,
Country,
DescriptionFormat
)
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url
""" """
site = Site(Site.ZIP_RECRUITER) self.scraper_input = None
self.url = "https://www.ziprecruiter.com"
self.session = create_session(proxy) self.session = create_session(proxy)
self.get_cookies() self._get_cookies()
super().__init__(site, proxy=proxy) super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
self.delay = 5
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
self.delay = 5
def find_jobs_in_page( def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes ZipRecruiter for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
continue_token = None
max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
for page in range(1, max_pages + 1):
if len(job_list) >= scraper_input.results_wanted:
break
if page > 1:
time.sleep(self.delay)
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
if jobs_on_page:
job_list.extend(jobs_on_page)
else:
break
if not continue_token:
break
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
def _find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None self, scraper_input: ScraperInput, continue_token: str | None = None
) -> Tuple[list[JobPost], Optional[str]]: ) -> Tuple[list[JobPost], Optional[str]]:
""" """
@ -41,73 +84,51 @@ class ZipRecruiterScraper(Scraper):
:param continue_token: :param continue_token:
:return: jobs found on page :return: jobs found on page
""" """
params = self.add_params(scraper_input) jobs_list = []
params = self._add_params(scraper_input)
if continue_token: if continue_token:
params["continue_from"] = continue_token params["continue_from"] = continue_token
try: try:
response = self.session.get( res= self.session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs", f"{self.api_url}/jobs-app/jobs",
headers=self.headers(), headers=self.headers,
params=params params=params
) )
if response.status_code != 200: if res.status_code not in range(200, 400):
raise ZipRecruiterException( if res.status_code == 429:
f"bad response status code: {response.status_code}" logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
) else:
logger.error(f'ZipRecruiter response status code {res.status_code}')
return jobs_list, ""
except Exception as e: except Exception as e:
if "Proxy responded with non 200 code" in str(e): if "Proxy responded with" in str(e):
raise ZipRecruiterException("bad proxy") logger.error(f'Indeed: Bad proxy')
raise ZipRecruiterException(str(e)) else:
logger.error(f'ZipRecruiter: {str(e)}')
return jobs_list, ""
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get("continue", None)
res_data = res.json()
jobs_list = res_data.get("jobs", [])
next_continue_token = res_data.get("continue", None)
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
job_results = [executor.submit(self.process_job, job) for job in jobs_list] job_results = [executor.submit(self._process_job, job) for job in jobs_list]
job_list = list(filter(None, (result.result() for result in job_results))) job_list = list(filter(None, (result.result() for result in job_results)))
return job_list, next_continue_token return job_list, next_continue_token
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def _process_job(self, job: dict) -> JobPost | None:
""" """
Scrapes ZipRecruiter for jobs with scraper_input criteria. Processes an individual job dict from the response
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
""" """
job_list: list[JobPost] = []
continue_token = None
max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
for page in range(1, max_pages + 1):
if len(job_list) >= scraper_input.results_wanted:
break
if page > 1:
time.sleep(self.delay)
jobs_on_page, continue_token = self.find_jobs_in_page(
scraper_input, continue_token
)
if jobs_on_page:
job_list.extend(jobs_on_page)
if not continue_token:
break
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
def process_job(self, job: dict) -> JobPost | None:
"""Processes an individual job dict from the response"""
title = job.get("name") title = job.get("name")
job_url = f"https://www.ziprecruiter.com/jobs//j?lvk={job['listing_key']}" job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
if job_url in self.seen_urls: if job_url in self.seen_urls:
return return
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
description = job.get("job_description", "").strip() description = job.get("job_description", "").strip()
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
company = job.get("hiring_company", {}).get("name") company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada" country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value) country_enum = Country.from_string(country_value)
@ -115,11 +136,10 @@ class ZipRecruiterScraper(Scraper):
location = Location( location = Location(
city=job.get("job_city"), state=job.get("job_state"), country=country_enum city=job.get("job_city"), state=job.get("job_state"), country=country_enum
) )
job_type = ZipRecruiterScraper.get_job_type_enum( job_type = self._get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower() job.get("employment_type", "").replace("_", "").lower()
) )
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date() date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
return JobPost( return JobPost(
title=title, title=title,
company_name=company, company_name=company,
@ -144,20 +164,19 @@ class ZipRecruiterScraper(Scraper):
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_cookies(self): def _get_cookies(self):
url="https://api.ziprecruiter.com/jobs-app/event"
data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
self.session.post(url, data=data, headers=ZipRecruiterScraper.headers()) self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None: def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
if job_type_str in job_type.value: if job_type_str in job_type.value:
return [job_type] return [job_type]
return None return None
@staticmethod @staticmethod
def add_params(scraper_input) -> dict[str, str | Any]: def _add_params(scraper_input) -> dict[str, str | Any]:
params = { params = {
"search": scraper_input.search_term, "search": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@ -177,18 +196,9 @@ class ZipRecruiterScraper(Scraper):
params["remote"] = 1 params["remote"] = 1
if scraper_input.distance: if scraper_input.distance:
params["radius"] = scraper_input.distance params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None}
params = {k: v for k, v in params.items() if v is not None} headers = {
return params
@staticmethod
def headers() -> dict:
"""
Returns headers needed for requests
:return: dict - Dictionary containing headers
"""
return {
"Host": "api.ziprecruiter.com", "Host": "api.ziprecruiter.com",
"accept": "*/*", "accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",