Compare commits

...

2 Commits

Author SHA1 Message Date
Cullen Watson
ba3a16b228 Description format (#107) 2024-02-14 16:04:23 -06:00
Cullen Watson
aeb1a50d2c fix job type search (#106) 2024-02-12 11:02:48 -06:00
11 changed files with 776 additions and 693 deletions

View File

@@ -11,7 +11,7 @@ work with us.*
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame - Aggregates the job postings in a Pandas DataFrame
- Proxy support (HTTP/S, SOCKS) - Proxy support
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3 Updated for release v1.1.3
@@ -67,12 +67,13 @@ Optional
├── location (int) ├── location (int)
├── distance (int): in miles ├── distance (int): in miles
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] ├── proxy (str): in format 'http://user:pass@host:port'
├── is_remote (bool) ├── is_remote (bool)
├── full_description (bool): fetches full description for LinkedIn (slower) ├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on the job board site ├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids ├── linkedin_company_ids (list[int): searches for linkedin jobs with specific company ids
├── description_format (enum): markdown, html (format type of the job descriptions)
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling) ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result) ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day) ├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)

13
poetry.lock generated
View File

@@ -524,6 +524,17 @@ files = [
{file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"},
] ]
[[package]]
name = "html2text"
version = "2020.1.16"
description = "Turn HTML into equivalent Markdown-structured text."
optional = false
python-versions = ">=3.5"
files = [
{file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
{file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.4" version = "3.4"
@@ -2435,4 +2446,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "404a77d78066cbb2ef71015562baf44aa11d12aac29a191c1ccc7758bfda598a" content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0"

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.43" version = "1.1.45"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"
@@ -18,6 +18,7 @@ beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0" pandas = "^2.1.0"
NUMPY = "1.24.2" NUMPY = "1.24.2"
pydantic = "^2.3.0" pydantic = "^2.3.0"
html2text = "^2020.1.16"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]

View File

@@ -15,17 +15,6 @@ from .scrapers.exceptions import (
GlassdoorException, GlassdoorException,
) )
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def scrape_jobs( def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None, site_name: str | list[str] | Site | list[Site] | None = None,
@@ -39,7 +28,8 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: str | None = None, proxy: str | None = None,
full_description: bool | None = False, description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None, linkedin_company_ids: list[int] | None = None,
offset: int | None = 0, offset: int | None = 0,
hours_old: int = None, hours_old: int = None,
@@ -49,6 +39,15 @@ def scrape_jobs(
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data :return: results_wanted: pandas dataframe containing job data
""" """
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
}
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def get_enum_from_value(value_str): def get_enum_from_value(value_str):
for job_type in JobType: for job_type in JobType:
@@ -61,16 +60,15 @@ def scrape_jobs(
def get_site_type(): def get_site_type():
site_types = list(Site) site_types = list(Site)
if isinstance(site_name, str): if isinstance(site_name, str):
site_types = [_map_str_to_site(site_name)] site_types = [map_str_to_site(site_name)]
elif isinstance(site_name, Site): elif isinstance(site_name, Site):
site_types = [site_name] site_types = [site_name]
elif isinstance(site_name, list): elif isinstance(site_name, list):
site_types = [ site_types = [
_map_str_to_site(site) if isinstance(site, str) else site map_str_to_site(site) if isinstance(site, str) else site
for site in site_name for site in site_name
] ]
return site_types return site_types
country_enum = Country.from_string(country_indeed) country_enum = Country.from_string(country_indeed)
scraper_input = ScraperInput( scraper_input = ScraperInput(
@@ -82,7 +80,8 @@ def scrape_jobs(
is_remote=is_remote, is_remote=is_remote,
job_type=job_type, job_type=job_type,
easy_apply=easy_apply, easy_apply=easy_apply,
full_description=full_description, description_format=description_format,
linkedin_fetch_description=linkedin_fetch_description,
results_wanted=results_wanted, results_wanted=results_wanted,
linkedin_company_ids=linkedin_company_ids, linkedin_company_ids=linkedin_company_ids,
offset=offset, offset=offset,
@@ -92,22 +91,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy) scraper = scraper_class(proxy=proxy)
try:
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
if site == Site.LINKEDIN:
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException(str(e))
if site == Site.GLASSDOOR:
raise GlassdoorException(str(e))
else:
raise e
return site.value, scraped_data return site.value, scraped_data
site_to_jobs_dict = {} site_to_jobs_dict = {}
@@ -188,8 +172,6 @@ def scrape_jobs(
"emails", "emails",
"description", "description",
] ]
jobs_formatted_df = jobs_df[desired_order] return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
else: else:
jobs_formatted_df = pd.DataFrame() return pd.DataFrame()
return jobs_formatted_df.sort_values(by='date_posted', ascending=False)

View File

@@ -210,6 +210,11 @@ class Compensation(BaseModel):
currency: Optional[str] = "USD" currency: Optional[str] = "USD"
class DescriptionFormat(Enum):
MARKDOWN = "markdown"
HTML = "html"
class JobPost(BaseModel): class JobPost(BaseModel):
title: str title: str
company_name: str company_name: str

View File

@@ -1,4 +1,11 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse, Country from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat
)
class Site(Enum): class Site(Enum):
@@ -18,9 +25,10 @@ class ScraperInput(BaseModel):
is_remote: bool = False is_remote: bool = False
job_type: JobType | None = None job_type: JobType | None = None
easy_apply: bool | None = None easy_apply: bool | None = None
full_description: bool = False
offset: int = 0 offset: int = 0
linkedin_fetch_description: bool = False
linkedin_company_ids: list[int] | None = None linkedin_company_ids: list[int] | None = None
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
results_wanted: int = 15 results_wanted: int = 15
hours_old: int | None = None hours_old: int | None = None

View File

@@ -6,7 +6,6 @@ This module contains routines to scrape Glassdoor.
""" """
import json import json
import requests import requests
from bs4 import BeautifulSoup
from typing import Optional from typing import Optional
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -14,7 +13,11 @@ from ..utils import count_urgent_words, extract_emails_from_text
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import create_session, modify_and_get_description from ..utils import (
create_session,
markdown_converter,
logger
)
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@@ -22,6 +25,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat
) )
@@ -33,12 +37,57 @@ class GlassdoorScraper(Scraper):
site = Site(Site.GLASSDOOR) site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
self.url = None self.base_url = None
self.country = None self.country = None
self.session = None
self.scraper_input = None
self.jobs_per_page = 30 self.jobs_per_page = 30
self.seen_urls = set() self.seen_urls = set()
def fetch_jobs_page( def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Glassdoor for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_url()
location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
return JobResponse(jobs=[])
all_jobs: list[JobPost] = []
cursor = None
max_pages = 30
self.session = create_session(self.proxy, is_tls=False, has_retry=True)
self.session.get(self.base_url)
try:
for page in range(
1 + (scraper_input.offset // self.jobs_per_page),
min(
(scraper_input.results_wanted // self.jobs_per_page) + 2,
max_pages + 1,
),
):
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
raise GlassdoorException(str(e))
except Exception as e:
raise GlassdoorException(str(e))
return JobResponse(jobs=all_jobs)
def _fetch_jobs_page(
self, self,
scraper_input: ScraperInput, scraper_input: ScraperInput,
location_id: int, location_id: int,
@@ -49,13 +98,13 @@ class GlassdoorScraper(Scraper):
""" """
Scrapes a page of Glassdoor for jobs with scraper_input criteria Scrapes a page of Glassdoor for jobs with scraper_input criteria
""" """
self.scraper_input = scraper_input
try: try:
payload = self.add_payload( payload = self._add_payload(
scraper_input, location_id, location_type, page_num, cursor location_id, location_type, page_num, cursor
) )
session = create_session(self.proxy, is_tls=False, has_retry=True) response = self.session.post(
response = session.post( f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload
f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
) )
if response.status_code != 200: if response.status_code != 200:
raise GlassdoorException( raise GlassdoorException(
@@ -71,9 +120,8 @@ class GlassdoorScraper(Scraper):
jobs = [] jobs = []
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data} future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data}
for future in as_completed(future_to_job_data): for future in as_completed(future_to_job_data):
job_data = future_to_job_data[future]
try: try:
job_post = future.result() job_post = future.result()
if job_post: if job_post:
@@ -85,10 +133,12 @@ class GlassdoorScraper(Scraper):
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
) )
def process_job(self, job_data): def _process_job(self, job_data):
"""Processes a single job and fetches its description.""" """
Processes a single job and fetches its description.
"""
job_id = job_data["jobview"]["job"]["listingId"] job_id = job_data["jobview"]["job"]["listingId"]
job_url = f'{self.url}job-listing/j?jl={job_id}' job_url = f'{self.base_url}job-listing/j?jl={job_id}'
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
@@ -108,15 +158,13 @@ class GlassdoorScraper(Scraper):
location = self.parse_location(location_name) location = self.parse_location(location_name)
compensation = self.parse_compensation(job["header"]) compensation = self.parse_compensation(job["header"])
try: try:
description = self.fetch_job_description(job_id) description = self._fetch_job_description(job_id)
except Exception as e : except:
description = None description = None
return JobPost(
job_post = JobPost(
title=title, title=title,
company_url=f"{self.url}Overview/W-EI_IE{company_id}.htm" if company_id else None, company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
company_name=company_name, company_name=company_name,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
@@ -127,51 +175,12 @@ class GlassdoorScraper(Scraper):
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
return job_post
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def _fetch_job_description(self, job_id):
""" """
Scrapes Glassdoor for jobs with scraper_input criteria. Fetches the job description for a single job ID.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
""" """
scraper_input.results_wanted = min(900, scraper_input.results_wanted) url = f"{self.base_url}/graph"
self.country = scraper_input.country
self.url = self.country.get_url()
location_id, location_type = self.get_location(
scraper_input.location, scraper_input.is_remote
)
all_jobs: list[JobPost] = []
cursor = None
max_pages = 30
try:
for page in range(
1 + (scraper_input.offset // self.jobs_per_page),
min(
(scraper_input.results_wanted // self.jobs_per_page) + 2,
max_pages + 1,
),
):
try:
jobs, cursor = self.fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
all_jobs.extend(jobs)
if len(all_jobs) >= scraper_input.results_wanted:
all_jobs = all_jobs[: scraper_input.results_wanted]
break
except Exception as e:
raise GlassdoorException(str(e))
except Exception as e:
raise GlassdoorException(str(e))
return JobResponse(jobs=all_jobs)
def fetch_job_description(self, job_id):
"""Fetches the job description for a single job ID."""
url = f"{self.url}/graph"
body = [ body = [
{ {
"operationName": "JobDetailQuery", "operationName": "JobDetailQuery",
@@ -196,49 +205,28 @@ class GlassdoorScraper(Scraper):
""" """
} }
] ]
response = requests.post(url, json=body, headers=GlassdoorScraper.headers()) res = requests.post(url, json=body, headers=self.headers)
if response.status_code != 200: if res.status_code != 200:
return None return None
data = response.json()[0] data = res.json()[0]
desc = data['data']['jobview']['job']['description'] desc = data['data']['jobview']['job']['description']
soup = BeautifulSoup(desc, 'html.parser') return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc
return modify_and_get_description(soup)
@staticmethod def _get_location(self, location: str, is_remote: bool) -> (int, str):
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
def get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote: if not location or is_remote:
return "11047", "STATE" # remote options return "11047", "STATE" # remote options
url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True) session = create_session(self.proxy, has_retry=True)
response = session.get(url) res = session.get(url)
if response.status_code != 200: if res.status_code != 200:
raise GlassdoorException( if res.status_code == 429:
f"bad response status code: {response.status_code}" logger.error(f'429 Response - Blocked by Glassdoor for too many requests')
) return None, None
items = response.json() else:
logger.error(f'Glassdoor response status code {res.status_code}')
return None, None
items = res.json()
if not items: if not items:
raise ValueError(f"Location '{location}' not found on Glassdoor") raise ValueError(f"Location '{location}' not found on Glassdoor")
location_type = items[0]["locationType"] location_type = items[0]["locationType"]
@@ -250,28 +238,25 @@ class GlassdoorScraper(Scraper):
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
@staticmethod def _add_payload(
def add_payload( self,
scraper_input,
location_id: int, location_id: int,
location_type: str, location_type: str,
page_num: int, page_num: int,
cursor: str | None = None, cursor: str | None = None,
) -> str: ) -> str:
# `fromage` is the posting time filter in days fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
filter_params = [] filter_params = []
if scraper_input.easy_apply: if self.scraper_input.easy_apply:
filter_params.append({"filterKey": "applicationType", "values": "1"}) filter_params.append({"filterKey": "applicationType", "values": "1"})
if fromage: if fromage:
filter_params.append({"filterKey": "fromAge", "values": str(fromage)}) filter_params.append({"filterKey": "fromAge", "values": str(fromage)})
payload = { payload = {
"operationName": "JobSearchResultsQuery", "operationName": "JobSearchResultsQuery",
"variables": { "variables": {
"excludeJobListingIds": [], "excludeJobListingIds": [],
"filterParams": filter_params, "filterParams": filter_params,
"keyword": scraper_input.search_term, "keyword": self.scraper_input.search_term,
"numJobsToShow": 30, "numJobsToShow": 30,
"locationType": location_type, "locationType": location_type,
"locationId": int(location_id), "locationId": int(location_id),
@@ -281,24 +266,201 @@ class GlassdoorScraper(Scraper):
"fromage": fromage, "fromage": fromage,
"sort": "date" "sort": "date"
}, },
"query": "query JobSearchResultsQuery($excludeJobListingIds: [Long!], $keyword: String, $locationId: Int, $locationType: LocationTypeEnum, $numJobsToShow: Int!, $pageCursor: String, $pageNumber: Int, $filterParams: [FilterParams], $originalPageUrl: String, $seoFriendlyUrlInput: String, $parameterUrlInput: String, $seoUrl: Boolean) {\n jobListings(\n contextHolder: {searchParams: {excludeJobListingIds: $excludeJobListingIds, keyword: $keyword, locationId: $locationId, locationType: $locationType, numPerPage: $numJobsToShow, pageCursor: $pageCursor, pageNumber: $pageNumber, filterParams: $filterParams, originalPageUrl: $originalPageUrl, seoFriendlyUrlInput: $seoFriendlyUrlInput, parameterUrlInput: $parameterUrlInput, seoUrl: $seoUrl, searchType: SR}}\n ) {\n companyFilterOptions {\n id\n shortName\n __typename\n }\n filterOptions\n indeedCtk\n jobListings {\n ...JobView\n __typename\n }\n jobListingSeoLinks {\n linkItems {\n position\n url\n __typename\n }\n __typename\n }\n jobSearchTrackingKey\n jobsPageSeoData {\n pageMetaDescription\n pageTitle\n __typename\n }\n paginationCursors {\n cursor\n pageNumber\n __typename\n }\n indexablePageForSeo\n searchResultsMetadata {\n searchCriteria {\n implicitLocation {\n id\n localizedDisplayName\n type\n __typename\n }\n keyword\n location {\n id\n shortName\n localizedShortName\n localizedDisplayName\n type\n __typename\n }\n __typename\n }\n footerVO {\n countryMenu {\n childNavigationLinks {\n id\n link\n textKey\n __typename\n }\n __typename\n }\n __typename\n }\n helpCenterDomain\n helpCenterLocale\n jobAlert {\n jobAlertExists\n __typename\n }\n jobSerpFaq {\n questions {\n answer\n question\n __typename\n }\n __typename\n }\n jobSerpJobOutlook {\n occupation\n paragraph\n __typename\n }\n showMachineReadableJobs\n __typename\n }\n serpSeoLinksVO {\n relatedJobTitlesResults\n searchedJobTitle\n searchedKeyword\n searchedLocationIdAsString\n searchedLocationSeoName\n searchedLocationType\n topCityIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerNameResults\n topOccupationResults\n __typename\n }\n totalJobsCount\n __typename\n }\n}\n\nfragment JobView on JobListingSearchResult {\n jobview {\n header {\n adOrderId\n advertiserType\n adOrderSponsorshipLevel\n ageInDays\n divisionEmployerName\n easyApply\n employer {\n id\n name\n shortName\n __typename\n }\n employerNameFromSearch\n goc\n gocConfidence\n gocId\n jobCountryId\n jobLink\n jobResultTrackingKey\n jobTitleText\n locationName\n locationType\n locId\n needsCommission\n payCurrency\n payPeriod\n payPeriodAdjustedPay {\n p10\n p50\n p90\n __typename\n }\n rating\n salarySource\n savedJobId\n sponsored\n __typename\n }\n job {\n descriptionFragments\n importConfigId\n jobTitleId\n jobTitleText\n listingId\n __typename\n }\n jobListingAdminDetails {\n cpcVal\n importConfigId\n jobListingId\n jobSourceId\n userEligibleForAdminJobDetails\n __typename\n }\n overview {\n shortName\n squareLogoUrl\n __typename\n }\n __typename\n }\n __typename\n}\n", "query": """
query JobSearchResultsQuery(
$excludeJobListingIds: [Long!],
$keyword: String,
$locationId: Int,
$locationType: LocationTypeEnum,
$numJobsToShow: Int!,
$pageCursor: String,
$pageNumber: Int,
$filterParams: [FilterParams],
$originalPageUrl: String,
$seoFriendlyUrlInput: String,
$parameterUrlInput: String,
$seoUrl: Boolean
) {
jobListings(
contextHolder: {
searchParams: {
excludeJobListingIds: $excludeJobListingIds,
keyword: $keyword,
locationId: $locationId,
locationType: $locationType,
numPerPage: $numJobsToShow,
pageCursor: $pageCursor,
pageNumber: $pageNumber,
filterParams: $filterParams,
originalPageUrl: $originalPageUrl,
seoFriendlyUrlInput: $seoFriendlyUrlInput,
parameterUrlInput: $parameterUrlInput,
seoUrl: $seoUrl,
searchType: SR
}
}
) {
companyFilterOptions {
id
shortName
__typename
}
filterOptions
indeedCtk
jobListings {
...JobView
__typename
}
jobListingSeoLinks {
linkItems {
position
url
__typename
}
__typename
}
jobSearchTrackingKey
jobsPageSeoData {
pageMetaDescription
pageTitle
__typename
}
paginationCursors {
cursor
pageNumber
__typename
}
indexablePageForSeo
searchResultsMetadata {
searchCriteria {
implicitLocation {
id
localizedDisplayName
type
__typename
}
keyword
location {
id
shortName
localizedShortName
localizedDisplayName
type
__typename
}
__typename
}
helpCenterDomain
helpCenterLocale
jobSerpJobOutlook {
occupation
paragraph
__typename
}
showMachineReadableJobs
__typename
}
totalJobsCount
__typename
}
} }
job_type_filters = { fragment JobView on JobListingSearchResult {
JobType.FULL_TIME: "fulltime", jobview {
JobType.PART_TIME: "parttime", header {
JobType.CONTRACT: "contract", adOrderId
JobType.INTERNSHIP: "internship", advertiserType
JobType.TEMPORARY: "temporary", adOrderSponsorshipLevel
ageInDays
divisionEmployerName
easyApply
employer {
id
name
shortName
__typename
} }
employerNameFromSearch
if scraper_input.job_type in job_type_filters: goc
filter_value = job_type_filters[scraper_input.job_type] gocConfidence
gocId
jobCountryId
jobLink
jobResultTrackingKey
jobTitleText
locationName
locationType
locId
needsCommission
payCurrency
payPeriod
payPeriodAdjustedPay {
p10
p50
p90
__typename
}
rating
salarySource
savedJobId
sponsored
__typename
}
job {
description
importConfigId
jobTitleId
jobTitleText
listingId
__typename
}
jobListingAdminDetails {
cpcVal
importConfigId
jobListingId
jobSourceId
userEligibleForAdminJobDetails
__typename
}
overview {
shortName
squareLogoUrl
__typename
}
__typename
}
__typename
}
"""
}
if self.scraper_input.job_type:
payload["variables"]["filterParams"].append( payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": filter_value} {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
) )
return json.dumps([payload]) return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None: def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
@@ -318,20 +480,13 @@ class GlassdoorScraper(Scraper):
if cursor_data["pageNumber"] == page_num: if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"] return cursor_data["cursor"]
@staticmethod headers = {
def headers() -> dict:
"""
Returns headers needed for requests
:return: dict - Dictionary containing headers
"""
return {
"authority": "www.glassdoor.com", "authority": "www.glassdoor.com",
"accept": "*/*", "accept": "*/*",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next", "apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5", "apollographql-client-version": "4.65.5",
"content-type": "application/json", "content-type": "application/json",
"cookie": 'gdId=91e2dfc4-c8b5-4fa7-83d0-11512b80262c; G_ENABLED_IDPS=google; trs=https%3A%2F%2Fwww.redhat.com%2F:referral:referral:2023-07-05+09%3A50%3A14.862:undefined:undefined; g_state={"i_p":1688587331651,"i_l":1}; _cfuvid=.7llazxhYFZWi6EISSPdVjtqF0NMVwzxr_E.cB1jgLs-1697828392979-0-604800000; GSESSIONID=undefined; JSESSIONID=F03DD1B5EE02DB6D842FE42B142F88F3; cass=1; jobsClicked=true; indeedCtk=1hd77b301k79i801; asst=1697829114.2; G_AUTHUSER_H=0; uc=8013A8318C98C517FE6DD0024636DFDEF978FC33266D93A2FAFEF364EACA608949D8B8FA2DC243D62DE271D733EB189D809ABE5B08D7B1AE865D217BD4EEBB97C282F5DA5FEFE79C937E3F6110B2A3A0ADBBA3B4B6DF5A996FEE00516100A65FCB11DA26817BE8D1C1BF6CFE36B5B68A3FDC2CFEC83AB797F7841FBB157C202332FC7E077B56BD39B167BDF3D9866E3B; AWSALB=zxc/Yk1nbWXXT6HjNyn3H4h4950ckVsFV/zOrq5LSoChYLE1qV+hDI8Axi3fUa9rlskndcO0M+Fw+ZnJ+AQ2afBFpyOd1acouLMYgkbEpqpQaWhY6/Gv4QH1zBcJ; AWSALBCORS=zxc/Yk1nbWXXT6HjNyn3H4h4950ckVsFV/zOrq5LSoChYLE1qV+hDI8Axi3fUa9rlskndcO0M+Fw+ZnJ+AQ2afBFpyOd1acouLMYgkbEpqpQaWhY6/Gv4QH1zBcJ; gdsid=1697828393025:1697830776351:668396EDB9E6A832022D34414128093D; at=HkH8Hnqi9uaMC7eu0okqyIwqp07ht9hBvE1_St7E_hRqPvkO9pUeJ1Jcpds4F3g6LL5ADaCNlxrPn0o6DumGMfog8qI1-zxaV_jpiFs3pugntw6WpVyYWdfioIZ1IDKupyteeLQEM1AO4zhGjY_rPZynpsiZBPO_B1au94sKv64rv23yvP56OiWKKfI-8_9hhLACEwWvM-Az7X-4aE2QdFt93VJbXbbGVf07bdDZfimsIkTtgJCLSRhU1V0kEM1Efyu66vo3m77gFFaMW7lxyYnb36I5PdDtEXBm3aL-zR7-qa5ywd94ISEivgqQOA4FPItNhqIlX4XrfD1lxVz6rfPaoTIDi4DI6UMCUjwyPsuv8mn0rYqDfRnmJpZ97fJ5AnhrknAd_6ZWN5v1OrxJczHzcXd8LO820QPoqxzzG13bmSTXLwGSxMUCtSrVsq05hicimQ3jpRt0c1dA4OkTNqF7_770B9JfcHcM8cr8-C4IL56dnOjr9KBGfN1Q2IvZM2cOBRbV7okiNOzKVZ3qJ24AE34WA2F3U6Whiu6H8nIuGG5hSNkVygY6CtglNZfFF9p8pJAZm79PngrrBv-CXFBZmhYLFo46lmFetDkiJ6mirtez4tKpzTIYjIp4_JAkiZFwbLJ2QGH4mK8kyyW0lZiX1DTuQec50N_5wvRo0Gt7nlKxzLsApMnaNhuQeH5ygh_pa381ORo9mQGi0EYF9zk00pa2--z4PtjfQ8KFq36GgpxKy5-o4qgqygZj8F01L8r-FiX2G4C7PREMIpAyHX2A4-_JxA1IS2j12EyqKTLqE9VcP06qm2Z-YuIW3ctmpMxy5G9_KiEiGv17weizhSFnl6SbpAEY-2VSmQ5V6jm3hoMp2jemkuGCRkZeFstLDEPxlzFN7WM; __cf_bm=zGaVjIJw4irf40_7UVw54B6Ohm271RUX4Tc8KVScrbs-1697830777-0-AYv2GnKTnnCU+cY9xHbJunO0DwlLDO6SIBnC/s/qldpKsGK0rRAjD6y8lbyATT/KlS7g29OZaN4fbd0lrJg0KmWbIybZIzfWVLHSYePVuOhu; asst=1697829114.2; at=dFhXf64wsf2TlnWy41xLs7skJkuxgKToEGcjGtDfUvW4oEAJ4tTIR5dKQ8wbwT75aIaGgdCfvcb-da7vwrCGWscCncmfLFQpJ9l-LLwoRfk-pMsxHhd77wvf-W7I0HSm7-Q5lQJqI9WyNGRxOa-RpzBTf4L8_Et4-3FzjPaAoYY5pY1FhuwXbN5asGOAMW-p8cjpbfn3PumlIYuckguWnjrcY2F31YJ_1noeoHM9tCGpymANbqGXRkG6aXY7yCfVXtdgZU1K5SMeaSPZIuF_iLUxjc_corzpNiH6qq7BIAmh-e5Aa-g7cwpZcln1fmwTVw4uTMZf1eLIMTa9WzgqZNkvG-sGaq_XxKA_Wai6xTTkOHfRgm4632Ba2963wdJvkGmUUa3tb_L4_wTgk3eFnHp5JhghLfT2Pe3KidP-yX__vx8JOsqe3fndCkKXgVz7xQKe1Dur-sMNlGwi4LXfguTT2YUI8C5Miq3pj2IHc7dC97eyyAiAM4HvyGWfaXWZcei6oIGrOwMvYgy0AcwFry6SIP2SxLT5TrxinRRuem1r1IcOTJsMJyUPp1QsZ7bOyq9G_0060B4CPyovw5523hEuqLTM-R5e5yavY6C_1DHUyE15C3mrh7kdvmlGZeflnHqkFTEKwwOftm-Mv-CKD5Db9ABFGNxKB2FH7nDH67hfOvm4tGNMzceBPKYJ3wciTt9jK3wy39_7cOYVywfrZ-oLhw_XtsbGSSeGn3HytrfgSADAh2sT0Gg6eCC9Xy1vh-Za337SVLUDXZ73W2xJxxUHBkFzZs8L_Xndo5DsbpWhVs9IYUGyraJdqB3SLgDbAppIBCJl4fx6_DG8-xOQPBvuFMlTROe1JVdHOzXI1GElwFDTuH1pjkg4I2G0NhAbE06Y-1illQE; gdsid=1697828393025:1697831731408:99C30D94108AC3030D61C736DDCDF11C',
"gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok", "gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok",
"origin": "https://www.glassdoor.com", "origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/", "referer": "https://www.glassdoor.com/",

View File

@@ -11,7 +11,6 @@ import requests
from typing import Any from typing import Any
from datetime import datetime from datetime import datetime
import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,8 @@ from ..utils import (
extract_emails_from_text, extract_emails_from_text,
create_session, create_session,
get_enum_from_job_type, get_enum_from_job_type,
modify_and_get_description markdown_converter,
logger
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@@ -31,6 +31,7 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
DescriptionFormat
) )
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
@@ -40,120 +41,23 @@ class IndeedScraper(Scraper):
""" """
Initializes IndeedScraper with the Indeed job search url Initializes IndeedScraper with the Indeed job search url
""" """
self.url = None self.scraper_input = None
self.country = None self.jobs_per_page = 25
self.num_workers = 10
self.seen_urls = set()
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
site = Site(Site.INDEED) site = Site(Site.INDEED)
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
self.jobs_per_page = 25
self.seen_urls = set()
def scrape_page(
self, scraper_input: ScraperInput, page: int
) -> tuple[list[JobPost], int]:
"""
Scrapes a page of Indeed for jobs with scraper_input criteria
:param scraper_input:
:param page:
:return: jobs found on page, total number of jobs found for search
"""
self.country = scraper_input.country
domain = self.country.indeed_domain_value
self.url = f"https://{domain}.indeed.com"
try:
session = create_session(self.proxy)
response = session.get(
f"{self.url}/m/jobs",
headers=self.get_headers(),
params=self.add_params(scraper_input, page),
allow_redirects=True,
timeout_seconds=10,
)
if response.status_code not in range(200, 400):
raise IndeedException(
f"bad response with status code: {response.status_code}"
)
except Exception as e:
if "Proxy responded with" in str(e):
raise IndeedException("bad proxy")
raise IndeedException(str(e))
soup = BeautifulSoup(response.content, "html.parser")
job_list = []
total_num_jobs = IndeedScraper.total_jobs(soup)
if "did not match any jobs" in response.text:
return job_list, total_num_jobs
jobs = IndeedScraper.parse_jobs(
soup
) #: can raise exception, handled by main scrape function
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
def process_job(job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
job_type = IndeedScraper.get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
job_post = JobPost(
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=f"{self.url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed['employer'] else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
country=self.country,
),
job_type=job_type,
compensation=self.get_compensation(job, job_detailed),
date_posted=date_posted,
job_url=job_url_client,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description)
if description
else None,
is_remote=IndeedScraper.is_job_remote(job, job_detailed, description)
)
return job_post
workers = 10
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self.get_job_details(job_keys)
with ThreadPoolExecutor(max_workers=workers) as executor:
job_results: list[Future] = [
executor.submit(process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
]
job_list = [result.result() for result in job_results if result.result()]
return job_list, total_num_jobs
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
Scrapes Indeed for jobs with scraper_input criteria Scrapes Indeed for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
job_list, total_results = self.scrape_page(scraper_input, 0) self.scraper_input = scraper_input
job_list = self._scrape_page()
pages_processed = 1 pages_processed = 1
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
@@ -162,12 +66,12 @@ class IndeedScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page + pages_processed) executor.submit(self._scrape_page, page + pages_processed)
for page in range(pages_to_process) for page in range(pages_to_process)
] ]
for future in futures: for future in futures:
jobs, _ = future.result() jobs = future.result()
if jobs: if jobs:
job_list += jobs job_list += jobs
new_jobs = True new_jobs = True
@@ -182,58 +86,138 @@ class IndeedScraper(Scraper):
if len(self.seen_urls) > scraper_input.results_wanted: if len(self.seen_urls) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted] job_list = job_list[:scraper_input.results_wanted]
job_response = JobResponse( return JobResponse(jobs=job_list)
jobs=job_list,
total_results=total_results,
)
return job_response
def get_description(self, job_page_url: str) -> str | None: def _scrape_page(self, page: int=0) -> list[JobPost]:
""" """
Retrieves job description by going to the job page url Scrapes a page of Indeed for jobs with scraper_input criteria
:param job_page_url: :param page:
:return: description :return: jobs found on page, total number of jobs found for search
""" """
parsed_url = urllib.parse.urlparse(job_page_url) job_list = []
params = urllib.parse.parse_qs(parsed_url.query) domain = self.scraper_input.country.indeed_domain_value
jk_value = params.get("jk", [None])[0] self.base_url = f"https://{domain}.indeed.com"
formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
try:
session = create_session(self.proxy) session = create_session(self.proxy)
try:
response = session.get( response = session.get(
formatted_url, f"{self.base_url}/m/jobs",
headers=self.get_headers(), headers=self.headers,
allow_redirects=True, params=self._add_params(page),
timeout_seconds=5,
) )
except Exception as e:
return None
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
if response.status_code == 429:
logger.error(f'429 Response - Blocked by Indeed for too many requests')
else:
logger.error(f'Indeed response status code {response.status_code}')
return job_list
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return job_list
soup = BeautifulSoup(response.content, "html.parser")
if "did not match any jobs" in response.text:
return job_list
jobs = IndeedScraper._parse_jobs(soup)
if (
not jobs.get("metaData", {})
.get("mosaicProviderJobCardsModel", {})
.get("results")
):
raise IndeedException("No jobs found.")
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
job_keys = [job['jobkey'] for job in jobs]
jobs_detailed = self._get_job_details(job_keys)
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
job_results: list[Future] = [
executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed)
]
job_list = [result.result() for result in job_results if result.result()]
return job_list
def _process_job(self, job: dict, job_detailed: dict) -> JobPost | None:
job_url = f'{self.base_url}/m/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.base_url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls:
return None return None
self.seen_urls.add(job_url)
description = job_detailed['description']['html']
description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
job_type = self._get_job_type(job)
timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d")
return JobPost(
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=f"{self.base_url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed[
'employer'] else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
country=self.scraper_input.country,
),
job_type=job_type,
compensation=self._get_compensation(job, job_detailed),
date_posted=date_posted,
job_url=job_url_client,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
is_remote=self._is_job_remote(job, job_detailed, description)
)
try: def _get_job_details(self, job_keys: list[str]) -> dict:
soup = BeautifulSoup(response.text, 'html.parser') """
script_tags = soup.find_all('script') Queries the GraphQL endpoint for detailed job information for the given job keys.
"""
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']'
payload = dict(self.api_payload)
payload["query"] = self.api_payload["query"].format(job_keys_gql=job_keys_gql)
response = requests.post(self.api_url, headers=self.api_headers, json=payload, proxies=self.proxy)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
job_description = '' def _add_params(self, page: int) -> dict[str, str | Any]:
for tag in script_tags: fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None
if 'window._initialData' in tag.text: params = {
json_str = tag.text "q": self.scraper_input.search_term,
json_str = json_str.split('window._initialData=')[1] "l": self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1],
json_str = json_str.rsplit(';', 1)[0] "filter": 0,
data = json.loads(json_str) "start": self.scraper_input.offset + page * 10,
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"] "sort": "date",
break "fromage": fromage,
except (KeyError, TypeError, IndexError): }
return None if self.scraper_input.distance:
params["radius"] = self.scraper_input.distance
soup = BeautifulSoup(job_description, "html.parser") sc_values = []
return modify_and_get_description(soup) if self.scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if self.scraper_input.job_type:
sc_values.append("jt({})".format(self.scraper_input.job_type.value[0]))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
if self.scraper_input.easy_apply:
params['iafilter'] = 1
return params
@staticmethod @staticmethod
def get_job_type(job: dict) -> list[JobType] | None: def _get_job_type(job: dict) -> list[JobType] | None:
""" """
Parses the job to get list of job types Parses the job to get list of job types
:param job: :param job:
@@ -252,7 +236,7 @@ class IndeedScraper(Scraper):
return job_types return job_types
@staticmethod @staticmethod
def get_compensation(job: dict, job_detailed: dict) -> Compensation: def _get_compensation(job: dict, job_detailed: dict) -> Compensation:
""" """
Parses the job to get Parses the job to get
:param job: :param job:
@@ -261,7 +245,7 @@ class IndeedScraper(Scraper):
""" """
comp = job_detailed['compensation']['baseSalary'] comp = job_detailed['compensation']['baseSalary']
if comp: if comp:
interval = IndeedScraper.get_correct_interval(comp['unitOfWork']) interval = IndeedScraper._get_correct_interval(comp['unitOfWork'])
if interval: if interval:
return Compensation( return Compensation(
interval=interval, interval=interval,
@@ -290,18 +274,13 @@ class IndeedScraper(Scraper):
return compensation return compensation
@staticmethod @staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict: def _parse_jobs(soup: BeautifulSoup) -> dict:
""" """
Parses the jobs from the soup object Parses the jobs from the soup object
:param soup: :param soup:
:return: jobs :return: jobs
""" """
def find_mosaic_script() -> Tag | None: def find_mosaic_script() -> Tag | None:
"""
Finds jobcards script tag
:return: script_tag
"""
script_tags = soup.find_all("script") script_tags = soup.find_all("script")
for tag in script_tags: for tag in script_tags:
@@ -314,7 +293,6 @@ class IndeedScraper(Scraper):
return None return None
script_tag = find_mosaic_script() script_tag = find_mosaic_script()
if script_tag: if script_tag:
script_str = script_tag.string script_str = script_tag.string
pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});' pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
@@ -331,67 +309,7 @@ class IndeedScraper(Scraper):
) )
@staticmethod @staticmethod
def total_jobs(soup: BeautifulSoup) -> int: def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
"""
Parses the total jobs for that search from soup object
:param soup:
:return: total_num_jobs
"""
script = soup.find("script", string=lambda t: t and "window._initialData" in t)
pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL)
match = pattern.search(script.string)
total_num_jobs = 0
if match:
json_str = match.group(1)
data = json.loads(json_str)
total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
return total_num_jobs
@staticmethod
def get_headers():
return {
'Host': 'www.indeed.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'sec-fetch-site': 'same-origin',
'sec-fetch-dest': 'document',
'accept-language': 'en-US,en;q=0.9',
'sec-fetch-mode': 'navigate',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
}
@staticmethod
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
# `fromage` is the posting time filter in days
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params = {
"q": scraper_input.search_term,
"l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date",
"fromage": fromage,
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
sc_values = []
if scraper_input.is_remote:
sc_values.append("attr(DSQF7)")
if scraper_input.job_type:
sc_values.append("jt({})".format(scraper_input.job_type.value))
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
if scraper_input.easy_apply:
params['iafilter'] = 1
return params
@staticmethod
def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
remote_keywords = ['remote', 'work from home', 'wfh'] remote_keywords = ['remote', 'work from home', 'wfh']
is_remote_in_attributes = any( is_remote_in_attributes = any(
any(keyword in attr['label'].lower() for keyword in remote_keywords) any(keyword in attr['label'].lower() for keyword in remote_keywords)
@@ -406,14 +324,34 @@ class IndeedScraper(Scraper):
taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0 taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
for taxonomy in job.get("taxonomyAttributes", []) for taxonomy in job.get("taxonomyAttributes", [])
) )
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy
@staticmethod
def _get_correct_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"DAY": "DAILY",
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY"
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")
def get_job_details(self, job_keys: list[str]) -> dict:
"""
Queries the GraphQL endpoint for detailed job information for the given job keys.
"""
url = "https://apis.indeed.com/graphql"
headers = { headers = {
'Host': 'www.indeed.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'sec-fetch-site': 'same-origin',
'sec-fetch-dest': 'document',
'accept-language': 'en-US,en;q=0.9',
'sec-fetch-mode': 'navigate',
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
}
api_headers = {
'Host': 'apis.indeed.com', 'Host': 'apis.indeed.com',
'content-type': 'application/json', 'content-type': 'application/json',
'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8',
@@ -424,11 +362,8 @@ class IndeedScraper(Scraper):
'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone',
'indeed-co': 'US', 'indeed-co': 'US',
} }
api_payload = {
job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']' "query": """
payload = {
"query": f"""
query GetJobData {{ query GetJobData {{
jobData(input: {{ jobData(input: {{
jobKeys: {job_keys_gql} jobKeys: {job_keys_gql}
@@ -480,23 +415,3 @@ class IndeedScraper(Scraper):
}} }}
""" """
} }
response = requests.post(url, headers=headers, json=payload, proxies=self.proxy)
if response.status_code == 200:
return response.json()['data']['jobData']['results']
else:
return {}
@staticmethod
def get_correct_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"DAY": "DAILY",
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY"
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")

View File

@@ -25,27 +25,30 @@ from ...jobs import (
JobResponse, JobResponse,
JobType, JobType,
Country, Country,
Compensation Compensation,
DescriptionFormat
) )
from ..utils import ( from ..utils import (
logger,
count_urgent_words, count_urgent_words,
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
currency_parser, currency_parser,
modify_and_get_description markdown_converter
) )
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
DELAY = 3 base_url = "https://www.linkedin.com"
delay = 3
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
self.scraper_input = None
site = Site(Site.LINKEDIN) site = Site(Site.LINKEDIN)
self.country = "worldwide" self.country = "worldwide"
self.url = "https://www.linkedin.com"
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
@@ -54,28 +57,16 @@ class LinkedInScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
self.scraper_input = scraper_input
job_list: list[JobPost] = [] job_list: list[JobPost] = []
seen_urls = set() seen_urls = set()
url_lock = Lock() url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
seconds_old = ( seconds_old = (
scraper_input.hours_old * 3600 scraper_input.hours_old * 3600
if scraper_input.hours_old if scraper_input.hours_old
else None else None
) )
def job_type_code(job_type_enum):
mapping = {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}
return mapping.get(job_type_enum, "")
continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000 continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
while continue_search(): while continue_search():
@@ -85,7 +76,7 @@ class LinkedInScraper(Scraper):
"location": scraper_input.location, "location": scraper_input.location,
"distance": scraper_input.distance, "distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None, "f_WT": 2 if scraper_input.is_remote else None,
"f_JT": job_type_code(scraper_input.job_type) "f_JT": self.job_type_code(scraper_input.job_type)
if scraper_input.job_type if scraper_input.job_type
else None, else None,
"pageNum": 0, "pageNum": 0,
@@ -98,23 +89,25 @@ class LinkedInScraper(Scraper):
params = {k: v for k, v in params.items() if v is not None} params = {k: v for k, v in params.items() if v is not None}
try: try:
response = session.get( response = session.get(
f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?", f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params, params=params,
allow_redirects=True, allow_redirects=True,
proxies=self.proxy, proxies=self.proxy,
headers=self.headers(), headers=self.headers,
timeout=10, timeout=10,
) )
response.raise_for_status() if response.status_code not in range(200, 400):
if response.status_code == 429:
except requests.HTTPError as e: logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
raise LinkedInException( else:
f"bad response status code: {e.response.status_code}" logger.error(f'LinkedIn response status code {response.status_code}')
) return JobResponse(job_list=job_list)
except ProxyError as e:
raise LinkedInException("bad proxy")
except Exception as e: except Exception as e:
raise LinkedInException(str(e)) if "Proxy responded with" in str(e):
logger.error(f'Indeed: Bad proxy')
else:
logger.error(f'Indeed: {str(e)}')
return JobResponse(job_list=job_list)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card") job_cards = soup.find_all("div", class_="base-search-card")
@@ -127,29 +120,29 @@ class LinkedInScraper(Scraper):
if href_tag and "href" in href_tag.attrs: if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0] href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1] job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}" job_url = f"{self.base_url}/jobs/view/{job_id}"
with url_lock: with url_lock:
if job_url in seen_urls: if job_url in seen_urls:
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)
# Call process_job directly without threading
try: try:
job_post = self.process_job(job_card, job_url, scraper_input.full_description) job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
if job_post: if job_post:
job_list.append(job_post) job_list.append(job_post)
if not continue_search():
break
except Exception as e: except Exception as e:
raise LinkedInException("Exception occurred while processing jobs") raise LinkedInException(str(e))
if continue_search(): if continue_search():
time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2)) time.sleep(random.uniform(self.delay, self.delay + 2))
page += 25 page += 25
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
salary_tag = job_card.find('span', class_='job-search-card__salary-info') salary_tag = job_card.find('span', class_='job-search-card__salary-info')
compensation = None compensation = None
@@ -179,7 +172,7 @@ class LinkedInScraper(Scraper):
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A" company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata") metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self.get_location(metadata_card) location = self._get_location(metadata_card)
datetime_tag = ( datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate") metadata_card.find("time", class_="job-search-card__listdate")
@@ -191,12 +184,12 @@ class LinkedInScraper(Scraper):
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
try: try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except Exception as e: except:
date_posted = None date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text") benefits_tag = job_card.find("span", class_="result-benefits__text")
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
if full_descr: if full_descr:
description, job_type = self.get_job_description(job_url) description, job_type = self._get_job_description(job_url)
return JobPost( return JobPost(
title=title, title=title,
@@ -213,7 +206,7 @@ class LinkedInScraper(Scraper):
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_job_description( def _get_job_description(
self, job_page_url: str self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]: ) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
""" """
@@ -223,11 +216,9 @@ class LinkedInScraper(Scraper):
""" """
try: try:
session = create_session(is_tls=False, has_retry=True) session = create_session(is_tls=False, has_retry=True)
response = session.get(job_page_url, timeout=5, proxies=self.proxy) response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
response.raise_for_status() response.raise_for_status()
except requests.HTTPError as e: except:
return None, None
except Exception as e:
return None, None return None, None
if response.url == "https://www.linkedin.com/signup": if response.url == "https://www.linkedin.com/signup":
return None, None return None, None
@@ -236,41 +227,19 @@ class LinkedInScraper(Scraper):
div_content = soup.find( div_content = soup.find(
"div", class_=lambda x: x and "show-more-less-html__markup" in x "div", class_=lambda x: x and "show-more-less-html__markup" in x
) )
description = None description = None
if div_content: if div_content is not None:
description = modify_and_get_description(div_content) def remove_attributes(tag):
for attr in list(tag.attrs):
del tag[attr]
return tag
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup)
def get_job_type( def _get_location(self, metadata_card: Optional[Tag]) -> Location:
soup_job_type: BeautifulSoup,
) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
return description, get_job_type(soup)
def get_location(self, metadata_card: Optional[Tag]) -> Location:
""" """
Extracts the location data from the job metadata card. Extracts the location data from the job metadata card.
:param metadata_card :param metadata_card
@@ -295,25 +264,50 @@ class LinkedInScraper(Scraper):
location = Location( location = Location(
city=city, city=city,
state=state, state=state,
country=Country.from_string(country), country=Country.from_string(country)
) )
return location return location
@staticmethod @staticmethod
def headers() -> dict: def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return { return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
headers = {
"authority": "www.linkedin.com", "authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9", "accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"macOS"',
# 'sec-fetch-dest': 'document',
# 'sec-fetch-mode': 'navigate',
# 'sec-fetch-site': 'none',
# 'sec-fetch-user': '?1',
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }

View File

@@ -1,20 +1,24 @@
import re import re
import logging
import numpy as np import numpy as np
import html2text
import tls_client import tls_client
import requests import requests
from requests.adapters import HTTPAdapter, Retry from requests.adapters import HTTPAdapter, Retry
from ..jobs import JobType from ..jobs import JobType
text_maker = html2text.HTML2Text()
def modify_and_get_description(soup): logger = logging.getLogger("JobSpy")
for li in soup.find_all('li'): logger.propagate = False
li.string = "- " + li.get_text() if not logger.handlers:
logger.setLevel(logging.ERROR)
description = soup.get_text(separator='\n').strip() console_handler = logging.StreamHandler()
description = re.sub(r'\n+', '\n', description) console_handler.setLevel(logging.ERROR)
return description formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def count_urgent_words(description: str) -> int: def count_urgent_words(description: str) -> int:
@@ -31,6 +35,17 @@ def count_urgent_words(description: str) -> int:
return count return count
def markdown_converter(description_html: str):
if description_html is None:
return ""
text_maker.ignore_links = False
try:
markdown = text_maker.handle(description_html)
return markdown.strip()
except AssertionError as e:
return ""
def extract_emails_from_text(text: str) -> list[str] | None: def extract_emails_from_text(text: str) -> list[str] | None:
if not text: if not text:
return None return None
@@ -41,14 +56,10 @@ def extract_emails_from_text(text: str) -> list[str] | None:
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session: def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
""" """
Creates a requests session with optional tls, proxy, and retry settings. Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object :return: A session object
""" """
if is_tls: if is_tls:
session = tls_client.Session( session = tls_client.Session(random_tls_extension_order=True)
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy session.proxies = proxy
else: else:
session = requests.Session() session = requests.Session()
@@ -65,7 +76,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo
session.mount('http://', adapter) session.mount('http://', adapter)
session.mount('https://', adapter) session.mount('https://', adapter)
return session return session
@@ -79,6 +89,7 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
res = job_type res = job_type
return res return res
def currency_parser(cur_str): def currency_parser(cur_str):
# Remove any non-numerical characters # Remove any non-numerical characters
# except for ',' '.' or '-' (e.g. EUR) # except for ',' '.' or '-' (e.g. EUR)
@@ -94,3 +105,5 @@ def currency_parser(cur_str):
num = float(cur_str) num = float(cur_str)
return np.round(num, 2) return np.round(num, 2)

View File

@@ -6,34 +6,76 @@ This module contains routines to scrape ZipRecruiter.
""" """
import math import math
import time import time
from datetime import datetime, timezone from datetime import datetime
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..utils import (
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country logger,
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description count_urgent_words,
extract_emails_from_text,
create_session,
markdown_converter
)
from ...jobs import (
JobPost,
Compensation,
Location,
JobResponse,
JobType,
Country,
DescriptionFormat
)
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url
""" """
site = Site(Site.ZIP_RECRUITER) self.scraper_input = None
self.url = "https://www.ziprecruiter.com"
self.session = create_session(proxy) self.session = create_session(proxy)
self.get_cookies() self._get_cookies()
super().__init__(site, proxy=proxy) super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
self.delay = 5
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
self.delay = 5
def find_jobs_in_page( def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes ZipRecruiter for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
continue_token = None
max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
for page in range(1, max_pages + 1):
if len(job_list) >= scraper_input.results_wanted:
break
if page > 1:
time.sleep(self.delay)
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
if jobs_on_page:
job_list.extend(jobs_on_page)
else:
break
if not continue_token:
break
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
def _find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None self, scraper_input: ScraperInput, continue_token: str | None = None
) -> Tuple[list[JobPost], Optional[str]]: ) -> Tuple[list[JobPost], Optional[str]]:
""" """
@@ -42,75 +84,51 @@ class ZipRecruiterScraper(Scraper):
:param continue_token: :param continue_token:
:return: jobs found on page :return: jobs found on page
""" """
params = self.add_params(scraper_input) jobs_list = []
params = self._add_params(scraper_input)
if continue_token: if continue_token:
params["continue_from"] = continue_token params["continue_from"] = continue_token
try: try:
response = self.session.get( res= self.session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs", f"{self.api_url}/jobs-app/jobs",
headers=self.headers(), headers=self.headers,
params=params params=params
) )
if response.status_code != 200: if res.status_code not in range(200, 400):
raise ZipRecruiterException( if res.status_code == 429:
f"bad response status code: {response.status_code}" logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests')
) else:
logger.error(f'ZipRecruiter response status code {res.status_code}')
return jobs_list, ""
except Exception as e: except Exception as e:
if "Proxy responded with non 200 code" in str(e): if "Proxy responded with" in str(e):
raise ZipRecruiterException("bad proxy") logger.error(f'Indeed: Bad proxy')
raise ZipRecruiterException(str(e)) else:
logger.error(f'Indeed: {str(e)}')
return jobs_list, ""
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get("continue", None)
res_data = res.json()
jobs_list = res_data.get("jobs", [])
next_continue_token = res_data.get("continue", None)
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
job_results = [executor.submit(self.process_job, job) for job in jobs_list] job_results = [executor.submit(self._process_job, job) for job in jobs_list]
job_list = list(filter(None, (result.result() for result in job_results))) job_list = list(filter(None, (result.result() for result in job_results)))
return job_list, next_continue_token return job_list, next_continue_token
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def _process_job(self, job: dict) -> JobPost | None:
""" """
Scrapes ZipRecruiter for jobs with scraper_input criteria. Processes an individual job dict from the response
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
""" """
job_list: list[JobPost] = []
continue_token = None
max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
for page in range(1, max_pages + 1):
if len(job_list) >= scraper_input.results_wanted:
break
if page > 1:
time.sleep(self.delay)
jobs_on_page, continue_token = self.find_jobs_in_page(
scraper_input, continue_token
)
if jobs_on_page:
job_list.extend(jobs_on_page)
if not continue_token:
break
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
def process_job(self, job: dict) -> JobPost | None:
"""Processes an individual job dict from the response"""
title = job.get("name") title = job.get("name")
job_url = f"https://www.ziprecruiter.com/jobs//j?lvk={job['listing_key']}" job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
if job_url in self.seen_urls: if job_url in self.seen_urls:
return return
self.seen_urls.add(job_url) self.seen_urls.add(job_url)
job_description_html = job.get("job_description", "").strip() description = job.get("job_description", "").strip()
description_soup = BeautifulSoup(job_description_html, "html.parser") description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description
description = modify_and_get_description(description_soup)
company = job.get("hiring_company", {}).get("name") company = job.get("hiring_company", {}).get("name")
country_value = "usa" if job.get("job_country") == "US" else "canada" country_value = "usa" if job.get("job_country") == "US" else "canada"
country_enum = Country.from_string(country_value) country_enum = Country.from_string(country_value)
@@ -118,11 +136,10 @@ class ZipRecruiterScraper(Scraper):
location = Location( location = Location(
city=job.get("job_city"), state=job.get("job_state"), country=country_enum city=job.get("job_city"), state=job.get("job_state"), country=country_enum
) )
job_type = ZipRecruiterScraper.get_job_type_enum( job_type = self._get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower() job.get("employment_type", "").replace("_", "").lower()
) )
date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date() date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date()
return JobPost( return JobPost(
title=title, title=title,
company_name=company, company_name=company,
@@ -147,20 +164,19 @@ class ZipRecruiterScraper(Scraper):
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_cookies(self): def _get_cookies(self):
url="https://api.ziprecruiter.com/jobs-app/event"
data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
self.session.post(url, data=data, headers=ZipRecruiterScraper.headers()) self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers)
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None: def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
if job_type_str in job_type.value: if job_type_str in job_type.value:
return [job_type] return [job_type]
return None return None
@staticmethod @staticmethod
def add_params(scraper_input) -> dict[str, str | Any]: def _add_params(scraper_input) -> dict[str, str | Any]:
params = { params = {
"search": scraper_input.search_term, "search": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@@ -168,39 +184,21 @@ class ZipRecruiterScraper(Scraper):
if scraper_input.hours_old: if scraper_input.hours_old:
fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
params['days'] = fromage params['days'] = fromage
job_type_value = None job_type_map = {
JobType.FULL_TIME: 'full_time',
JobType.PART_TIME: 'part_time'
}
if scraper_input.job_type: if scraper_input.job_type:
if scraper_input.job_type.value == "fulltime": params['employment_type'] = job_type_map[scraper_input.job_type] if scraper_input.job_type in job_type_map else scraper_input.job_type.value[0]
job_type_value = "full_time"
elif scraper_input.job_type.value == "parttime":
job_type_value = "part_time"
else:
job_type_value = scraper_input.job_type.value
if scraper_input.easy_apply: if scraper_input.easy_apply:
params['zipapply'] = 1 params['zipapply'] = 1
if job_type_value:
params[
"refine_by_employment"
] = f"employment_type:employment_type:{job_type_value}"
if scraper_input.is_remote: if scraper_input.is_remote:
params["refine_by_location_type"] = "only_remote" params["remote"] = 1
if scraper_input.distance: if scraper_input.distance:
params["radius"] = scraper_input.distance params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None}
params = {k: v for k, v in params.items() if v is not None} headers = {
return params
@staticmethod
def headers() -> dict:
"""
Returns headers needed for requests
:return: dict - Dictionary containing headers
"""
return {
"Host": "api.ziprecruiter.com", "Host": "api.ziprecruiter.com",
"accept": "*/*", "accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",