Compare commits

...

2 Commits

Author SHA1 Message Date
Cullen Watson 9f4083380d
indeed:remove tpe (#210) 2024-10-19 18:01:59 -05:00
Olzhas Arystanov 9207ab56f6
fix: extract tests out of src (#209) 2024-10-19 16:56:38 -05:00
22 changed files with 443 additions and 424 deletions

View File

@ -19,4 +19,4 @@ jobs:
pip install poetry pip install poetry
poetry install poetry install
- name: Run tests - name: Run tests
run: poetry run pytest src/tests/test_all.py run: poetry run pytest tests/test_all.py

View File

@ -168,10 +168,7 @@ Indeed specific
├── company_employees_label ├── company_employees_label
├── company_revenue_label ├── company_revenue_label
├── company_description ├── company_description
├── ceo_name └── logo_photo_url
├── ceo_photo_url
├── logo_photo_url
└── banner_photo_url
``` ```
## Supported Countries for Job Searching ## Supported Countries for Job Searching

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.69" version = "1.1.70"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@ -5,7 +5,7 @@ from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .jobs import JobType, Location from .jobs import JobType, Location
from .scrapers.utils import logger, set_logger_level, extract_salary from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper from .scrapers.glassdoor import GlassdoorScraper
@ -102,7 +102,7 @@ def scrape_jobs(
scraped_data: JobResponse = scraper.scrape(scraper_input) scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize() cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
logger.info(f"{site_name} finished scraping") create_logger(site_name).info(f"finished scraping")
return site.value, scraped_data return site.value, scraped_data
site_to_jobs_dict = {} site_to_jobs_dict = {}
@ -228,15 +228,12 @@ def scrape_jobs(
"emails", "emails",
"description", "description",
"company_url", "company_url",
"logo_photo_url",
"company_url_direct", "company_url_direct",
"company_addresses", "company_addresses",
"company_num_employees", "company_num_employees",
"company_revenue", "company_revenue",
"company_description", "company_description",
"logo_photo_url",
"banner_photo_url",
"ceo_name",
"ceo_photo_url",
] ]
# Step 3: Ensure all desired columns are present, adding missing ones as empty # Step 3: Ensure all desired columns are present, adding missing ones as empty

View File

@ -256,8 +256,6 @@ class JobPost(BaseModel):
company_num_employees: str | None = None company_num_employees: str | None = None
company_revenue: str | None = None company_revenue: str | None = None
company_description: str | None = None company_description: str | None = None
ceo_name: str | None = None
ceo_photo_url: str | None = None
logo_photo_url: str | None = None logo_photo_url: str | None = None
banner_photo_url: str | None = None banner_photo_url: str | None = None

View File

@ -14,13 +14,13 @@ from typing import Optional, Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text from ..utils import extract_emails_from_text, create_logger
from ..exceptions import GlassdoorException from ..exceptions import GlassdoorException
from ..utils import ( from ..utils import (
create_session, create_session,
markdown_converter, markdown_converter,
logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -32,9 +32,13 @@ from ...jobs import (
DescriptionFormat, DescriptionFormat,
) )
logger = create_logger("Glassdoor")
class GlassdoorScraper(Scraper): class GlassdoorScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
""" """
Initializes GlassdoorScraper with the Glassdoor job search url Initializes GlassdoorScraper with the Glassdoor job search url
""" """
@ -59,9 +63,12 @@ class GlassdoorScraper(Scraper):
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url() self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True) self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
)
token = self._get_csrf_token() token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token headers["gd-csrf-token"] = token if token else fallback_token
self.session.headers.update(headers)
location_id, location_type = self._get_location( location_id, location_type = self._get_location(
scraper_input.location, scraper_input.is_remote scraper_input.location, scraper_input.is_remote
@ -76,7 +83,7 @@ class GlassdoorScraper(Scraper):
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2 tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1) range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end): for page in range(range_start, range_end):
logger.info(f"Glassdoor search page: {page}") logger.info(f"search page: {page} / {range_end-1}")
try: try:
jobs, cursor = self._fetch_jobs_page( jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor scraper_input, location_id, location_type, page, cursor
@ -107,7 +114,6 @@ class GlassdoorScraper(Scraper):
payload = self._add_payload(location_id, location_type, page_num, cursor) payload = self._add_payload(location_id, location_type, page_num, cursor)
response = self.session.post( response = self.session.post(
f"{self.base_url}/graph", f"{self.base_url}/graph",
headers=self.headers,
timeout_seconds=15, timeout_seconds=15,
data=payload, data=payload,
) )
@ -148,9 +154,7 @@ class GlassdoorScraper(Scraper):
""" """
Fetches csrf token needed for API by visiting a generic page Fetches csrf token needed for API by visiting a generic page
""" """
res = self.session.get( res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm")
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
)
pattern = r'"token":\s*"([^"]+)"' pattern = r'"token":\s*"([^"]+)"'
matches = re.findall(pattern, res.text) matches = re.findall(pattern, res.text)
token = None token = None
@ -199,7 +203,7 @@ class GlassdoorScraper(Scraper):
.lower() .lower()
) )
return JobPost( return JobPost(
id=str(job_id), id=f"gd-{job_id}",
title=title, title=title,
company_url=company_url if company_id else None, company_url=company_url if company_id else None,
company_name=company_name, company_name=company_name,
@ -243,7 +247,7 @@ class GlassdoorScraper(Scraper):
""", """,
} }
] ]
res = requests.post(url, json=body, headers=self.headers) res = requests.post(url, json=body, headers=headers)
if res.status_code != 200: if res.status_code != 200:
return None return None
data = res.json()[0] data = res.json()[0]
@ -256,7 +260,7 @@ class GlassdoorScraper(Scraper):
if not location or is_remote: if not location or is_remote:
return "11047", "STATE" # remote options return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
res = self.session.get(url, headers=self.headers) res = self.session.get(url)
if res.status_code != 200: if res.status_code != 200:
if res.status_code == 429: if res.status_code == 429:
err = f"429 Response - Blocked by Glassdoor for too many requests" err = f"429 Response - Blocked by Glassdoor for too many requests"
@ -310,7 +314,7 @@ class GlassdoorScraper(Scraper):
"fromage": fromage, "fromage": fromage,
"sort": "date", "sort": "date",
}, },
"query": self.query_template, "query": query_template,
} }
if self.scraper_input.job_type: if self.scraper_input.job_type:
payload["variables"]["filterParams"].append( payload["variables"]["filterParams"].append(
@ -358,188 +362,3 @@ class GlassdoorScraper(Scraper):
for cursor_data in pagination_cursors: for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num: if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"] return cursor_data["cursor"]
fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
headers = {
"authority": "www.glassdoor.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"apollographql-client-name": "job-search-next",
"apollographql-client-version": "4.65.5",
"content-type": "application/json",
"origin": "https://www.glassdoor.com",
"referer": "https://www.glassdoor.com/",
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
query_template = """
query JobSearchResultsQuery(
$excludeJobListingIds: [Long!],
$keyword: String,
$locationId: Int,
$locationType: LocationTypeEnum,
$numJobsToShow: Int!,
$pageCursor: String,
$pageNumber: Int,
$filterParams: [FilterParams],
$originalPageUrl: String,
$seoFriendlyUrlInput: String,
$parameterUrlInput: String,
$seoUrl: Boolean
) {
jobListings(
contextHolder: {
searchParams: {
excludeJobListingIds: $excludeJobListingIds,
keyword: $keyword,
locationId: $locationId,
locationType: $locationType,
numPerPage: $numJobsToShow,
pageCursor: $pageCursor,
pageNumber: $pageNumber,
filterParams: $filterParams,
originalPageUrl: $originalPageUrl,
seoFriendlyUrlInput: $seoFriendlyUrlInput,
parameterUrlInput: $parameterUrlInput,
seoUrl: $seoUrl,
searchType: SR
}
}
) {
companyFilterOptions {
id
shortName
__typename
}
filterOptions
indeedCtk
jobListings {
...JobView
__typename
}
jobListingSeoLinks {
linkItems {
position
url
__typename
}
__typename
}
jobSearchTrackingKey
jobsPageSeoData {
pageMetaDescription
pageTitle
__typename
}
paginationCursors {
cursor
pageNumber
__typename
}
indexablePageForSeo
searchResultsMetadata {
searchCriteria {
implicitLocation {
id
localizedDisplayName
type
__typename
}
keyword
location {
id
shortName
localizedShortName
localizedDisplayName
type
__typename
}
__typename
}
helpCenterDomain
helpCenterLocale
jobSerpJobOutlook {
occupation
paragraph
__typename
}
showMachineReadableJobs
__typename
}
totalJobsCount
__typename
}
}
fragment JobView on JobListingSearchResult {
jobview {
header {
adOrderId
advertiserType
adOrderSponsorshipLevel
ageInDays
divisionEmployerName
easyApply
employer {
id
name
shortName
__typename
}
employerNameFromSearch
goc
gocConfidence
gocId
jobCountryId
jobLink
jobResultTrackingKey
jobTitleText
locationName
locationType
locId
needsCommission
payCurrency
payPeriod
payPeriodAdjustedPay {
p10
p50
p90
__typename
}
rating
salarySource
savedJobId
sponsored
__typename
}
job {
description
importConfigId
jobTitleId
jobTitleText
listingId
__typename
}
jobListingAdminDetails {
cpcVal
importConfigId
jobListingId
jobSourceId
userEligibleForAdminJobDetails
__typename
}
overview {
shortName
squareLogoUrl
__typename
}
__typename
}
__typename
}
"""

View File

@ -0,0 +1,184 @@
# Browser-like HTTP request headers addressed to www.glassdoor.com.
# Long values are split with adjacent-literal concatenation; the resulting
# strings are unchanged.
headers = {
    # Target host and content negotiation.
    "authority": "www.glassdoor.com",
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    # Apollo GraphQL client identification.
    "apollographql-client-name": "job-search-next",
    "apollographql-client-version": "4.65.5",
    "content-type": "application/json",
    "origin": "https://www.glassdoor.com",
    "referer": "https://www.glassdoor.com/",
    # Client-hint and fetch-metadata headers describing Chrome 118 on macOS.
    "sec-ch-ua": (
        '"Chromium";v="118", '
        '"Google Chrome";v="118", '
        '"Not=A?Brand";v="99"'
    ),
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    ),
}
# GraphQL document holding the `JobSearchResultsQuery` operation and the
# `JobView` fragment that the operation spreads into `jobListings`.
# All inputs are GraphQL `$variables` declared in the operation header; the
# text contains no Python format placeholders.
# The string is runtime data: any change to it changes the request body.
query_template = """
query JobSearchResultsQuery(
$excludeJobListingIds: [Long!],
$keyword: String,
$locationId: Int,
$locationType: LocationTypeEnum,
$numJobsToShow: Int!,
$pageCursor: String,
$pageNumber: Int,
$filterParams: [FilterParams],
$originalPageUrl: String,
$seoFriendlyUrlInput: String,
$parameterUrlInput: String,
$seoUrl: Boolean
) {
jobListings(
contextHolder: {
searchParams: {
excludeJobListingIds: $excludeJobListingIds,
keyword: $keyword,
locationId: $locationId,
locationType: $locationType,
numPerPage: $numJobsToShow,
pageCursor: $pageCursor,
pageNumber: $pageNumber,
filterParams: $filterParams,
originalPageUrl: $originalPageUrl,
seoFriendlyUrlInput: $seoFriendlyUrlInput,
parameterUrlInput: $parameterUrlInput,
seoUrl: $seoUrl,
searchType: SR
}
}
) {
companyFilterOptions {
id
shortName
__typename
}
filterOptions
indeedCtk
jobListings {
...JobView
__typename
}
jobListingSeoLinks {
linkItems {
position
url
__typename
}
__typename
}
jobSearchTrackingKey
jobsPageSeoData {
pageMetaDescription
pageTitle
__typename
}
paginationCursors {
cursor
pageNumber
__typename
}
indexablePageForSeo
searchResultsMetadata {
searchCriteria {
implicitLocation {
id
localizedDisplayName
type
__typename
}
keyword
location {
id
shortName
localizedShortName
localizedDisplayName
type
__typename
}
__typename
}
helpCenterDomain
helpCenterLocale
jobSerpJobOutlook {
occupation
paragraph
__typename
}
showMachineReadableJobs
__typename
}
totalJobsCount
__typename
}
}
fragment JobView on JobListingSearchResult {
jobview {
header {
adOrderId
advertiserType
adOrderSponsorshipLevel
ageInDays
divisionEmployerName
easyApply
employer {
id
name
shortName
__typename
}
employerNameFromSearch
goc
gocConfidence
gocId
jobCountryId
jobLink
jobResultTrackingKey
jobTitleText
locationName
locationType
locId
needsCommission
payCurrency
payPeriod
payPeriodAdjustedPay {
p10
p50
p90
__typename
}
rating
salarySource
savedJobId
sponsored
__typename
}
job {
description
importConfigId
jobTitleId
jobTitleText
listingId
__typename
}
jobListingAdminDetails {
cpcVal
importConfigId
jobListingId
jobSourceId
userEligibleForAdminJobDetails
__typename
}
overview {
shortName
squareLogoUrl
__typename
}
__typename
}
__typename
}
"""
# Hard-coded token string made of three colon-separated segments, written
# one segment per literal; the concatenated value is unchanged.
fallback_token = (
    "Ft6oHEWlRZrxDww95Cpazw"
    ":0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q"
    ":wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
)

View File

@ -10,15 +10,15 @@ from __future__ import annotations
import math import math
from typing import Tuple from typing import Tuple
from datetime import datetime from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, Future
from .constants import job_search_query, api_headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import ( from ..utils import (
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
markdown_converter, markdown_converter,
logger,
create_session, create_session,
create_logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -30,15 +30,21 @@ from ...jobs import (
DescriptionFormat, DescriptionFormat,
) )
logger = create_logger("Indeed")
class IndeedScraper(Scraper): class IndeedScraper(Scraper):
def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
""" """
Initializes IndeedScraper with the Indeed API url Initializes IndeedScraper with the Indeed API url
""" """
super().__init__(Site.INDEED, proxies=proxies) super().__init__(Site.INDEED, proxies=proxies)
self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False) self.session = create_session(
proxies=self.proxies, ca_cert=ca_cert, is_tls=False
)
self.scraper_input = None self.scraper_input = None
self.jobs_per_page = 100 self.jobs_per_page = 100
self.num_workers = 10 self.num_workers = 10
@ -57,7 +63,7 @@ class IndeedScraper(Scraper):
self.scraper_input = scraper_input self.scraper_input = scraper_input
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
self.base_url = f"https://{domain}.indeed.com" self.base_url = f"https://{domain}.indeed.com"
self.headers = self.api_headers.copy() self.headers = api_headers.copy()
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
job_list = [] job_list = []
page = 1 page = 1
@ -65,17 +71,19 @@ class IndeedScraper(Scraper):
cursor = None cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100) offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages): for _ in range(offset_pages):
logger.info(f"Indeed skipping search page: {page}") logger.info(f"skipping search page: {page}")
__, cursor = self._scrape_page(cursor) __, cursor = self._scrape_page(cursor)
if not __: if not __:
logger.info(f"Indeed found no jobs on page: {page}") logger.info(f"found no jobs on page: {page}")
break break
while len(self.seen_urls) < scraper_input.results_wanted: while len(self.seen_urls) < scraper_input.results_wanted:
logger.info(f"Indeed search page: {page}") logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}"
)
jobs, cursor = self._scrape_page(cursor) jobs, cursor = self._scrape_page(cursor)
if not jobs: if not jobs:
logger.info(f"Indeed found no jobs on page: {page}") logger.info(f"found no jobs on page: {page}")
break break
job_list += jobs job_list += jobs
page += 1 page += 1
@ -95,7 +103,7 @@ class IndeedScraper(Scraper):
if self.scraper_input.search_term if self.scraper_input.search_term
else "" else ""
) )
query = self.job_search_query.format( query = job_search_query.format(
what=(f'what: "{search_term}"' if search_term else ""), what=(f'what: "{search_term}"' if search_term else ""),
location=( location=(
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}' f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
@ -109,28 +117,29 @@ class IndeedScraper(Scraper):
payload = { payload = {
"query": query, "query": query,
} }
api_headers = self.api_headers.copy() api_headers_temp = api_headers.copy()
api_headers["indeed-co"] = self.api_country_code api_headers_temp["indeed-co"] = self.api_country_code
response = self.session.post( response = self.session.post(
self.api_url, self.api_url,
headers=api_headers, headers=api_headers_temp,
json=payload, json=payload,
timeout=10, timeout=10,
) )
if response.status_code != 200: if not response.ok:
logger.info( logger.info(
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)" f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
) )
return jobs, new_cursor return jobs, new_cursor
data = response.json() data = response.json()
jobs = data["data"]["jobSearch"]["results"] jobs = data["data"]["jobSearch"]["results"]
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"] new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
with ThreadPoolExecutor(max_workers=self.num_workers) as executor: job_list = []
job_results: list[Future] = [ for job in jobs:
executor.submit(self._process_job, job["job"]) for job in jobs processed_job = self._process_job(job["job"])
] if processed_job:
job_list = [result.result() for result in job_results if result.result()] job_list.append(processed_job)
return job_list, new_cursor return job_list, new_cursor
def _build_filters(self): def _build_filters(self):
@ -212,7 +221,7 @@ class IndeedScraper(Scraper):
employer_details = employer.get("employerDetails", {}) if employer else {} employer_details = employer.get("employerDetails", {}) if employer else {}
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
return JobPost( return JobPost(
id=str(job["key"]), id=f'in-{job["key"]}',
title=job["title"], title=job["title"],
description=description, description=description,
company_name=job["employer"].get("name") if job.get("employer") else None, company_name=job["employer"].get("name") if job.get("employer") else None,
@ -251,18 +260,11 @@ class IndeedScraper(Scraper):
company_num_employees=employer_details.get("employeesLocalizedLabel"), company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"), company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"), company_description=employer_details.get("briefDescription"),
ceo_name=employer_details.get("ceoName"),
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
logo_photo_url=( logo_photo_url=(
employer["images"].get("squareLogoUrl") employer["images"].get("squareLogoUrl")
if employer and employer.get("images") if employer and employer.get("images")
else None else None
), ),
banner_photo_url=(
employer["images"].get("headerImageUrl")
if employer and employer.get("images")
else None
),
) )
@staticmethod @staticmethod
@ -347,112 +349,3 @@ class IndeedScraper(Scraper):
return CompensationInterval[mapped_interval] return CompensationInterval[mapped_interval]
else: else:
raise ValueError(f"Unsupported interval: {interval}") raise ValueError(f"Unsupported interval: {interval}")
api_headers = {
"Host": "apis.indeed.com",
"content-type": "application/json",
"indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
"accept": "application/json",
"indeed-locale": "en-US",
"accept-language": "en-US,en;q=0.9",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
"indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
}
job_search_query = """
query GetJobData {{
jobSearch(
{what}
{location}
limit: 100
{cursor}
sort: RELEVANCE
{filters}
) {{
pageInfo {{
nextCursor
}}
results {{
trackingKey
job {{
source {{
name
}}
key
title
datePublished
dateOnIndeed
description {{
html
}}
location {{
countryName
countryCode
admin1Code
city
postalCode
streetAddress
formatted {{
short
long
}}
}}
compensation {{
estimated {{
currencyCode
baseSalary {{
unitOfWork
range {{
... on Range {{
min
max
}}
}}
}}
}}
baseSalary {{
unitOfWork
range {{
... on Range {{
min
max
}}
}}
}}
currencyCode
}}
attributes {{
key
label
}}
employer {{
relativeCompanyPageUrl
name
dossier {{
employerDetails {{
addresses
industry
employeesLocalizedLabel
revenueLocalizedLabel
briefDescription
ceoName
ceoPhotoUrl
}}
images {{
headerImageUrl
squareLogoUrl
}}
links {{
corporateWebsite
}}
}}
}}
recruit {{
viewJobUrl
detailedSalary
workSchedule
}}
}}
}}
}}
}}
"""

View File

@ -0,0 +1,109 @@
# `str.format` template for the `GetJobData` GraphQL query.
# Placeholders filled by the caller: {what}, {location}, {cursor}, {filters}.
# Every literal GraphQL brace is doubled ("{{" / "}}") so that `.format()`
# emits a single brace; keep that escaping when editing the selection set.
# NOTE(review): the query still selects `ceoName`, `ceoPhotoUrl` and
# `headerImageUrl` - confirm a consumer still reads them, otherwise they can
# be dropped from the selection.
job_search_query = """
query GetJobData {{
jobSearch(
{what}
{location}
limit: 100
{cursor}
sort: RELEVANCE
{filters}
) {{
pageInfo {{
nextCursor
}}
results {{
trackingKey
job {{
source {{
name
}}
key
title
datePublished
dateOnIndeed
description {{
html
}}
location {{
countryName
countryCode
admin1Code
city
postalCode
streetAddress
formatted {{
short
long
}}
}}
compensation {{
estimated {{
currencyCode
baseSalary {{
unitOfWork
range {{
... on Range {{
min
max
}}
}}
}}
}}
baseSalary {{
unitOfWork
range {{
... on Range {{
min
max
}}
}}
}}
currencyCode
}}
attributes {{
key
label
}}
employer {{
relativeCompanyPageUrl
name
dossier {{
employerDetails {{
addresses
industry
employeesLocalizedLabel
revenueLocalizedLabel
briefDescription
ceoName
ceoPhotoUrl
}}
images {{
headerImageUrl
squareLogoUrl
}}
links {{
corporateWebsite
}}
}}
}}
recruit {{
viewJobUrl
detailedSalary
workSchedule
}}
}}
}}
}}
}}
"""
# HTTP request headers addressed to apis.indeed.com, identifying the client
# as the Indeed iOS app (version 193.1).
# Long values are split with adjacent-literal concatenation; the resulting
# strings are unchanged.
api_headers = {
    "Host": "apis.indeed.com",
    "content-type": "application/json",
    # NOTE(review): credential-like value committed in source - confirm it is
    # meant to be public.
    "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
    "accept": "application/json",
    "indeed-locale": "en-US",
    "accept-language": "en-US,en;q=0.9",
    "user-agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Mobile/15E148 Indeed App 193.1"
    ),
    "indeed-app-info": (
        "appv=193.1; appid=com.indeed.jobsearch; "
        "osv=16.6.1; os=ios; dtype=phone"
    ),
}

View File

@ -7,6 +7,7 @@ This module contains routines to scrape LinkedIn.
from __future__ import annotations from __future__ import annotations
import math
import time import time
import random import random
import regex as re import regex as re
@ -17,9 +18,10 @@ from bs4.element import Tag
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote from urllib.parse import urlparse, urlunparse, unquote
from .constants import headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ..utils import create_session, remove_attributes from ..utils import create_session, remove_attributes, create_logger
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Location, Location,
@ -30,13 +32,14 @@ from ...jobs import (
DescriptionFormat, DescriptionFormat,
) )
from ..utils import ( from ..utils import (
logger,
extract_emails_from_text, extract_emails_from_text,
get_enum_from_job_type, get_enum_from_job_type,
currency_parser, currency_parser,
markdown_converter, markdown_converter,
) )
logger = create_logger("LinkedIn")
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
base_url = "https://www.linkedin.com" base_url = "https://www.linkedin.com"
@ -44,7 +47,9 @@ class LinkedInScraper(Scraper):
band_delay = 4 band_delay = 4
jobs_per_page = 25 jobs_per_page = 25
def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
@ -57,7 +62,7 @@ class LinkedInScraper(Scraper):
delay=5, delay=5,
clear_cookies=True, clear_cookies=True,
) )
self.session.headers.update(self.headers) self.session.headers.update(headers)
self.scraper_input = None self.scraper_input = None
self.country = "worldwide" self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+') self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@ -71,17 +76,19 @@ class LinkedInScraper(Scraper):
self.scraper_input = scraper_input self.scraper_input = scraper_input
job_list: list[JobPost] = [] job_list: list[JobPost] = []
seen_ids = set() seen_ids = set()
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0 start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0 request_count = 0
seconds_old = ( seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None scraper_input.hours_old * 3600 if scraper_input.hours_old else None
) )
continue_search = ( continue_search = (
lambda: len(job_list) < scraper_input.results_wanted and page < 1000 lambda: len(job_list) < scraper_input.results_wanted and start < 1000
) )
while continue_search(): while continue_search():
request_count += 1 request_count += 1
logger.info(f"LinkedIn search page: {request_count}") logger.info(
f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = { params = {
"keywords": scraper_input.search_term, "keywords": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@ -93,7 +100,7 @@ class LinkedInScraper(Scraper):
else None else None
), ),
"pageNum": 0, "pageNum": 0,
"start": page, "start": start,
"f_AL": "true" if scraper_input.easy_apply else None, "f_AL": "true" if scraper_input.easy_apply else None,
"f_C": ( "f_C": (
",".join(map(str, scraper_input.linkedin_company_ids)) ",".join(map(str, scraper_input.linkedin_company_ids))
@ -155,7 +162,7 @@ class LinkedInScraper(Scraper):
if continue_search(): if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += len(job_list) start += len(job_list)
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
@ -211,7 +218,7 @@ class LinkedInScraper(Scraper):
job_details = self._get_job_details(job_id) job_details = self._get_job_details(job_id)
return JobPost( return JobPost(
id=job_id, id=f"li-{job_id}",
title=title, title=title,
company_name=company, company_name=company,
company_url=company_url, company_url=company_url,
@ -267,15 +274,19 @@ class LinkedInScraper(Scraper):
) )
if job_function_span: if job_function_span:
job_function = job_function_span.text.strip() job_function = job_function_span.text.strip()
logo_photo_url = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return { return {
"description": description, "description": description,
"job_level": self._parse_job_level(soup), "job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup), "company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup), "job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup), "job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get( "logo_photo_url": logo_photo_url,
"data-delayed-url"
),
"job_function": job_function, "job_function": job_function,
} }
@ -402,12 +413,3 @@ class LinkedInScraper(Scraper):
JobType.CONTRACT: "C", JobType.CONTRACT: "C",
JobType.TEMPORARY: "T", JobType.TEMPORARY: "T",
}.get(job_type_enum, "") }.get(job_type_enum, "")
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

View File

@ -0,0 +1,8 @@
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

View File

@ -12,15 +12,18 @@ from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType from ..jobs import CompensationInterval, JobType
logger = logging.getLogger("JobSpy")
logger.propagate = False def create_logger(name: str):
if not logger.handlers: logger = logging.getLogger(f"JobSpy:{name}")
logger.setLevel(logging.INFO) logger.propagate = False
console_handler = logging.StreamHandler() if not logger.handlers:
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logger.setLevel(logging.INFO)
formatter = logging.Formatter(format) console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter) format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
logger.addHandler(console_handler) formatter = logging.Formatter(format)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
return logger
class RotatingProxySession: class RotatingProxySession:
@ -138,7 +141,9 @@ def set_logger_level(verbose: int = 2):
level_name = {2: "INFO", 1: "WARNING", 0: "ERROR"}.get(verbose, "INFO") level_name = {2: "INFO", 1: "WARNING", 0: "ERROR"}.get(verbose, "INFO")
level = getattr(logging, level_name.upper(), None) level = getattr(logging, level_name.upper(), None)
if level is not None: if level is not None:
logger.setLevel(level) for logger_name in logging.root.manager.loggerDict:
if logger_name.startswith("JobSpy:"):
logging.getLogger(logger_name).setLevel(level)
else: else:
raise ValueError(f"Invalid log level: {level_name}") raise ValueError(f"Invalid log level: {level_name}")
@ -199,6 +204,10 @@ def extract_salary(
monthly_threshold=30000, monthly_threshold=30000,
enforce_annual_salary=False, enforce_annual_salary=False,
): ):
"""
Extracts salary information from a string and returns the salary interval, min and max salary values, and currency.
(TODO: Needs test cases as the regex is complicated and may not cover all edge cases)
"""
if not salary_str: if not salary_str:
return None, None, None, None return None, None, None, None

View File

@ -18,13 +18,14 @@ from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .constants import headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import ( from ..utils import (
logger,
extract_emails_from_text, extract_emails_from_text,
create_session, create_session,
markdown_converter, markdown_converter,
remove_attributes, remove_attributes,
create_logger,
) )
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -36,12 +37,16 @@ from ...jobs import (
DescriptionFormat, DescriptionFormat,
) )
logger = create_logger("ZipRecruiter")
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com" base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com" api_url = "https://api.ziprecruiter.com"
def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
""" """
Initializes ZipRecruiterScraper with the ZipRecruiter job search url Initializes ZipRecruiterScraper with the ZipRecruiter job search url
""" """
@ -49,6 +54,7 @@ class ZipRecruiterScraper(Scraper):
self.scraper_input = None self.scraper_input = None
self.session = create_session(proxies=proxies, ca_cert=ca_cert) self.session = create_session(proxies=proxies, ca_cert=ca_cert)
self.session.headers.update(headers)
self._get_cookies() self._get_cookies()
self.delay = 5 self.delay = 5
@ -71,7 +77,7 @@ class ZipRecruiterScraper(Scraper):
break break
if page > 1: if page > 1:
time.sleep(self.delay) time.sleep(self.delay)
logger.info(f"ZipRecruiter search page: {page}") logger.info(f"search page: {page} / {max_pages}")
jobs_on_page, continue_token = self._find_jobs_in_page( jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token scraper_input, continue_token
) )
@ -97,9 +103,7 @@ class ZipRecruiterScraper(Scraper):
if continue_token: if continue_token:
params["continue_from"] = continue_token params["continue_from"] = continue_token
try: try:
res = self.session.get( res = self.session.get(f"{self.api_url}/jobs-app/jobs", params=params)
f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
)
if res.status_code not in range(200, 400): if res.status_code not in range(200, 400):
if res.status_code == 429: if res.status_code == 429:
err = "429 Response - Blocked by ZipRecruiter for too many requests" err = "429 Response - Blocked by ZipRecruiter for too many requests"
@ -160,7 +164,7 @@ class ZipRecruiterScraper(Scraper):
description_full, job_url_direct = self._get_descr(job_url) description_full, job_url_direct = self._get_descr(job_url)
return JobPost( return JobPost(
id=str(job["listing_key"]), id=f'zr-{job["listing_key"]}',
title=title, title=title,
company_name=company, company_name=company,
location=location, location=location,
@ -180,7 +184,7 @@ class ZipRecruiterScraper(Scraper):
) )
def _get_descr(self, job_url): def _get_descr(self, job_url):
res = self.session.get(job_url, headers=self.headers, allow_redirects=True) res = self.session.get(job_url, allow_redirects=True)
description_full = job_url_direct = None description_full = job_url_direct = None
if res.ok: if res.ok:
soup = BeautifulSoup(res.text, "html.parser") soup = BeautifulSoup(res.text, "html.parser")
@ -213,7 +217,7 @@ class ZipRecruiterScraper(Scraper):
def _get_cookies(self): def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
url = f"{self.api_url}/jobs-app/event" url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data, headers=self.headers) self.session.post(url, data=data)
@staticmethod @staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None: def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
@ -241,14 +245,3 @@ class ZipRecruiterScraper(Scraper):
if scraper_input.distance: if scraper_input.distance:
params["radius"] = scraper_input.distance params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None} return {k: v for k, v in params.items() if v is not None}
headers = {
"Host": "api.ziprecruiter.com",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
}

View File

@ -0,0 +1,10 @@
headers = {
"Host": "api.ziprecruiter.com",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
}

View File

View File

@ -1,18 +0,0 @@
from ..jobspy import scrape_jobs
import pandas as pd
def test_all():
result = scrape_jobs(
site_name=[
"linkedin",
"indeed",
"glassdoor",
], # ziprecruiter needs good ip, and temp fix to pass test on ci
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 15
), "Result should be a non-empty DataFrame"

18
tests/test_all.py Normal file
View File

@ -0,0 +1,18 @@
from jobspy import scrape_jobs
import pandas as pd
def test_all():
sites = [
"indeed",
"glassdoor",
] # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
result = scrape_jobs(
site_name=sites,
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
), "Result should be a non-empty DataFrame"

View File

@ -1,4 +1,4 @@
from ..jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from ..jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from ..jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd

View File

@ -1,4 +1,4 @@
from ..jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd