From 9f4083380da28902859f801984e69be7642f3416 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sat, 19 Oct 2024 18:01:59 -0500 Subject: [PATCH] indeed:remove tpe (#210) --- README.md | 5 +- pyproject.toml | 2 +- src/jobspy/__init__.py | 9 +- src/jobspy/jobs/__init__.py | 2 - src/jobspy/scrapers/glassdoor/__init__.py | 217 ++---------------- src/jobspy/scrapers/glassdoor/constants.py | 184 +++++++++++++++ src/jobspy/scrapers/indeed/__init__.py | 167 +++----------- src/jobspy/scrapers/indeed/constants.py | 109 +++++++++ src/jobspy/scrapers/linkedin/__init__.py | 46 ++-- src/jobspy/scrapers/linkedin/constants.py | 8 + src/jobspy/scrapers/utils.py | 29 ++- src/jobspy/scrapers/ziprecruiter/__init__.py | 33 ++- src/jobspy/scrapers/ziprecruiter/constants.py | 10 + tests/test_all.py | 12 +- 14 files changed, 426 insertions(+), 407 deletions(-) create mode 100644 src/jobspy/scrapers/glassdoor/constants.py create mode 100644 src/jobspy/scrapers/indeed/constants.py create mode 100644 src/jobspy/scrapers/linkedin/constants.py create mode 100644 src/jobspy/scrapers/ziprecruiter/constants.py diff --git a/README.md b/README.md index 3e9eb66..391bcc6 100644 --- a/README.md +++ b/README.md @@ -168,10 +168,7 @@ Indeed specific ├── company_employees_label ├── company_revenue_label ├── company_description -├── ceo_name -├── ceo_photo_url -├── logo_photo_url -└── banner_photo_url +└── logo_photo_url ``` ## Supported Countries for Job Searching diff --git a/pyproject.toml b/pyproject.toml index 1d06412..bb5e8e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.69" +version = "1.1.70" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index f6b8eb9..f9f02ad 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -5,7 +5,7 @@ from typing import Tuple from concurrent.futures import ThreadPoolExecutor, as_completed from .jobs import JobType, Location -from .scrapers.utils import logger, set_logger_level, extract_salary +from .scrapers.utils import set_logger_level, extract_salary, create_logger from .scrapers.indeed import IndeedScraper from .scrapers.ziprecruiter import ZipRecruiterScraper from .scrapers.glassdoor import GlassdoorScraper @@ -102,7 +102,7 @@ def scrape_jobs( scraped_data: JobResponse = scraper.scrape(scraper_input) cap_name = site.value.capitalize() site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name - logger.info(f"{site_name} finished scraping") + create_logger(site_name).info(f"finished scraping") return site.value, scraped_data site_to_jobs_dict = {} @@ -228,15 +228,12 @@ def scrape_jobs( "emails", "description", "company_url", + "logo_photo_url", "company_url_direct", "company_addresses", "company_num_employees", "company_revenue", "company_description", - "logo_photo_url", - "banner_photo_url", - "ceo_name", - "ceo_photo_url", ] # Step 3: Ensure all desired columns are present, adding missing ones as empty diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 825dabd..48ef824 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -256,8 +256,6 @@ class JobPost(BaseModel): company_num_employees: str | None = None company_revenue: str | None = None company_description: str | None = None - ceo_name: str | None = None - ceo_photo_url: str | None = None logo_photo_url: str | None = None banner_photo_url: str | None = None diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index d8666da..eab4ee5 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -14,13 +14,13 @@ from typing import Optional, Tuple from datetime import datetime, timedelta from concurrent.futures import ThreadPoolExecutor, as_completed +from .constants import fallback_token, query_template, headers from .. import Scraper, ScraperInput, Site -from ..utils import extract_emails_from_text +from ..utils import extract_emails_from_text, create_logger from ..exceptions import GlassdoorException from ..utils import ( create_session, markdown_converter, - logger, ) from ...jobs import ( JobPost, @@ -32,9 +32,13 @@ from ...jobs import ( DescriptionFormat, ) +logger = create_logger("Glassdoor") + class GlassdoorScraper(Scraper): - def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): + def __init__( + self, proxies: list[str] | str | None = None, ca_cert: str | None = None + ): """ Initializes GlassdoorScraper with the Glassdoor job search url """ @@ -59,9 +63,12 @@ class GlassdoorScraper(Scraper): self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) self.base_url = self.scraper_input.country.get_glassdoor_url() - self.session = create_session(proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True) + self.session = create_session( + proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True + ) token = self._get_csrf_token() - self.headers["gd-csrf-token"] = token if token else self.fallback_token + headers["gd-csrf-token"] = token if token else fallback_token + self.session.headers.update(headers) location_id, location_type = self._get_location( scraper_input.location, scraper_input.is_remote @@ -76,7 +83,7 @@ class GlassdoorScraper(Scraper): tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2 range_end = min(tot_pages, self.max_pages + 1) for page in range(range_start, range_end): - logger.info(f"Glassdoor search page: {page}") + logger.info(f"search page: {page} / {range_end-1}") try: jobs, cursor = self._fetch_jobs_page( scraper_input, location_id, location_type, page, cursor @@ -107,7 +114,6 @@ class GlassdoorScraper(Scraper): payload = self._add_payload(location_id, location_type, page_num, cursor) response = self.session.post( f"{self.base_url}/graph", - headers=self.headers, timeout_seconds=15, data=payload, ) @@ -148,9 +154,7 @@ class GlassdoorScraper(Scraper): """ Fetches csrf token needed for API by visiting a generic page """ - res = self.session.get( - f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers - ) + res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm") pattern = r'"token":\s*"([^"]+)"' matches = re.findall(pattern, res.text) token = None @@ -199,7 +203,7 @@ class GlassdoorScraper(Scraper): .lower() ) return JobPost( - id=str(job_id), + id=f"gd-{job_id}", title=title, company_url=company_url if company_id else None, company_name=company_name, @@ -243,7 +247,7 @@ class GlassdoorScraper(Scraper): """, } ] - res = requests.post(url, json=body, headers=self.headers) + res = requests.post(url, json=body, headers=headers) if res.status_code != 200: return None data = res.json()[0] @@ -256,7 +260,7 @@ class GlassdoorScraper(Scraper): if not location or is_remote: return "11047", "STATE" # remote options url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" - res = self.session.get(url, headers=self.headers) + res = self.session.get(url) if res.status_code != 200: if res.status_code == 429: err = f"429 Response - Blocked by Glassdoor for too many requests" @@ -310,7 +314,7 @@ class GlassdoorScraper(Scraper): "fromage": fromage, "sort": "date", }, - "query": self.query_template, + "query": query_template, } if self.scraper_input.job_type: payload["variables"]["filterParams"].append( @@ -358,188 +362,3 @@ class GlassdoorScraper(Scraper): for cursor_data in pagination_cursors: if cursor_data["pageNumber"] == page_num: return cursor_data["cursor"] - - fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok" - headers = { - "authority": "www.glassdoor.com", - "accept": "*/*", - "accept-language": "en-US,en;q=0.9", - "apollographql-client-name": "job-search-next", - "apollographql-client-version": "4.65.5", - "content-type": "application/json", - "origin": "https://www.glassdoor.com", - "referer": "https://www.glassdoor.com/", - "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', - "sec-ch-ua-mobile": "?0", - "sec-ch-ua-platform": '"macOS"', - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", - "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", - } - query_template = """ - query JobSearchResultsQuery( - $excludeJobListingIds: [Long!], - $keyword: String, - $locationId: Int, - $locationType: LocationTypeEnum, - $numJobsToShow: Int!, - $pageCursor: String, - $pageNumber: Int, - $filterParams: [FilterParams], - $originalPageUrl: String, - $seoFriendlyUrlInput: String, - $parameterUrlInput: String, - $seoUrl: Boolean - ) { - jobListings( - contextHolder: { - searchParams: { - excludeJobListingIds: $excludeJobListingIds, - keyword: $keyword, - locationId: $locationId, - locationType: $locationType, - numPerPage: $numJobsToShow, - pageCursor: $pageCursor, - pageNumber: $pageNumber, - filterParams: $filterParams, - originalPageUrl: $originalPageUrl, - seoFriendlyUrlInput: $seoFriendlyUrlInput, - parameterUrlInput: $parameterUrlInput, - seoUrl: $seoUrl, - searchType: SR - } - } - ) { - companyFilterOptions { - id - shortName - __typename - } - filterOptions - indeedCtk - jobListings { - ...JobView - __typename - } - jobListingSeoLinks { - linkItems { - position - url - __typename - } - __typename - } - jobSearchTrackingKey - jobsPageSeoData { - pageMetaDescription - pageTitle - __typename - } - paginationCursors { - cursor - pageNumber - __typename - } - indexablePageForSeo - searchResultsMetadata { - searchCriteria { - implicitLocation { - id - localizedDisplayName - type - __typename - } - keyword - location { - id - shortName - localizedShortName - localizedDisplayName - type - __typename - } - __typename - } - helpCenterDomain - helpCenterLocale - jobSerpJobOutlook { - occupation - paragraph - __typename - } - showMachineReadableJobs - __typename - } - totalJobsCount - __typename - } - } - - fragment JobView on JobListingSearchResult { - jobview { - header { - adOrderId - advertiserType - adOrderSponsorshipLevel - ageInDays - divisionEmployerName - easyApply - employer { - id - name - shortName - __typename - } - employerNameFromSearch - goc - gocConfidence - gocId - jobCountryId - jobLink - jobResultTrackingKey - jobTitleText - locationName - locationType - locId - needsCommission - payCurrency - payPeriod - payPeriodAdjustedPay { - p10 - p50 - p90 - __typename - } - rating - salarySource - savedJobId - sponsored - __typename - } - job { - description - importConfigId - jobTitleId - jobTitleText - listingId - __typename - } - jobListingAdminDetails { - cpcVal - importConfigId - jobListingId - jobSourceId - userEligibleForAdminJobDetails - __typename - } - overview { - shortName - squareLogoUrl - __typename - } - __typename - } - __typename - } - """ diff --git a/src/jobspy/scrapers/glassdoor/constants.py b/src/jobspy/scrapers/glassdoor/constants.py new file mode 100644 index 0000000..2c811e8 --- /dev/null +++ b/src/jobspy/scrapers/glassdoor/constants.py @@ -0,0 +1,184 @@ +headers = { + "authority": "www.glassdoor.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "apollographql-client-name": "job-search-next", + "apollographql-client-version": "4.65.5", + "content-type": "application/json", + "origin": "https://www.glassdoor.com", + "referer": "https://www.glassdoor.com/", + "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"macOS"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", +} +query_template = """ + query JobSearchResultsQuery( + $excludeJobListingIds: [Long!], + $keyword: String, + $locationId: Int, + $locationType: LocationTypeEnum, + $numJobsToShow: Int!, + $pageCursor: String, + $pageNumber: Int, + $filterParams: [FilterParams], + $originalPageUrl: String, + $seoFriendlyUrlInput: String, + $parameterUrlInput: String, + $seoUrl: Boolean + ) { + jobListings( + contextHolder: { + searchParams: { + excludeJobListingIds: $excludeJobListingIds, + keyword: $keyword, + locationId: $locationId, + locationType: $locationType, + numPerPage: $numJobsToShow, + pageCursor: $pageCursor, + pageNumber: $pageNumber, + filterParams: $filterParams, + originalPageUrl: $originalPageUrl, + seoFriendlyUrlInput: $seoFriendlyUrlInput, + parameterUrlInput: $parameterUrlInput, + seoUrl: $seoUrl, + searchType: SR + } + } + ) { + companyFilterOptions { + id + shortName + __typename + } + filterOptions + indeedCtk + jobListings { + ...JobView + __typename + } + jobListingSeoLinks { + linkItems { + position + url + __typename + } + __typename + } + jobSearchTrackingKey + jobsPageSeoData { + pageMetaDescription + pageTitle + __typename + } + paginationCursors { + cursor + pageNumber + __typename + } + indexablePageForSeo + searchResultsMetadata { + searchCriteria { + implicitLocation { + id + localizedDisplayName + type + __typename + } + keyword + location { + id + shortName + localizedShortName + localizedDisplayName + type + __typename + } + __typename + } + helpCenterDomain + helpCenterLocale + jobSerpJobOutlook { + occupation + paragraph + __typename + } + showMachineReadableJobs + __typename + } + totalJobsCount + __typename + } + } + + fragment JobView on JobListingSearchResult { + jobview { + header { + adOrderId + advertiserType + adOrderSponsorshipLevel + ageInDays + divisionEmployerName + easyApply + employer { + id + name + shortName + __typename + } + employerNameFromSearch + goc + gocConfidence + gocId + jobCountryId + jobLink + jobResultTrackingKey + jobTitleText + locationName + locationType + locId + needsCommission + payCurrency + payPeriod + payPeriodAdjustedPay { + p10 + p50 + p90 + __typename + } + rating + salarySource + savedJobId + sponsored + __typename + } + job { + description + importConfigId + jobTitleId + jobTitleText + listingId + __typename + } + jobListingAdminDetails { + cpcVal + importConfigId + jobListingId + jobSourceId + userEligibleForAdminJobDetails + __typename + } + overview { + shortName + squareLogoUrl + __typename + } + __typename + } + __typename + } +""" +fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok" diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index b5ee1ef..0aa9f44 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -10,15 +10,15 @@ from __future__ import annotations import math from typing import Tuple from datetime import datetime -from concurrent.futures import ThreadPoolExecutor, Future +from .constants import job_search_query, api_headers from .. import Scraper, ScraperInput, Site from ..utils import ( extract_emails_from_text, get_enum_from_job_type, markdown_converter, - logger, create_session, + create_logger, ) from ...jobs import ( JobPost, @@ -30,15 +30,21 @@ from ...jobs import ( DescriptionFormat, ) +logger = create_logger("Indeed") + class IndeedScraper(Scraper): - def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): + def __init__( + self, proxies: list[str] | str | None = None, ca_cert: str | None = None + ): """ Initializes IndeedScraper with the Indeed API url """ super().__init__(Site.INDEED, proxies=proxies) - self.session = create_session(proxies=self.proxies, ca_cert=ca_cert, is_tls=False) + self.session = create_session( + proxies=self.proxies, ca_cert=ca_cert, is_tls=False + ) self.scraper_input = None self.jobs_per_page = 100 self.num_workers = 10 @@ -57,7 +63,7 @@ class IndeedScraper(Scraper): self.scraper_input = scraper_input domain, self.api_country_code = self.scraper_input.country.indeed_domain_value self.base_url = f"https://{domain}.indeed.com" - self.headers = self.api_headers.copy() + self.headers = api_headers.copy() self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value job_list = [] page = 1 @@ -65,17 +71,19 @@ class IndeedScraper(Scraper): cursor = None offset_pages = math.ceil(self.scraper_input.offset / 100) for _ in range(offset_pages): - logger.info(f"Indeed skipping search page: {page}") + logger.info(f"skipping search page: {page}") __, cursor = self._scrape_page(cursor) if not __: - logger.info(f"Indeed found no jobs on page: {page}") + logger.info(f"found no jobs on page: {page}") break while len(self.seen_urls) < scraper_input.results_wanted: - logger.info(f"Indeed search page: {page}") + logger.info( + f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}" + ) jobs, cursor = self._scrape_page(cursor) if not jobs: - logger.info(f"Indeed found no jobs on page: {page}") + logger.info(f"found no jobs on page: {page}") break job_list += jobs page += 1 @@ -95,7 +103,7 @@ class IndeedScraper(Scraper): if self.scraper_input.search_term else "" ) - query = self.job_search_query.format( + query = job_search_query.format( what=(f'what: "{search_term}"' if search_term else ""), location=( f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}' @@ -109,28 +117,29 @@ class IndeedScraper(Scraper): payload = { "query": query, } - api_headers = self.api_headers.copy() - api_headers["indeed-co"] = self.api_country_code + api_headers_temp = api_headers.copy() + api_headers_temp["indeed-co"] = self.api_country_code response = self.session.post( self.api_url, - headers=api_headers, + headers=api_headers_temp, json=payload, timeout=10, ) - if response.status_code != 200: + if not response.ok: logger.info( - f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)" + f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)" ) return jobs, new_cursor data = response.json() jobs = data["data"]["jobSearch"]["results"] new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"] - with ThreadPoolExecutor(max_workers=self.num_workers) as executor: - job_results: list[Future] = [ - executor.submit(self._process_job, job["job"]) for job in jobs - ] - job_list = [result.result() for result in job_results if result.result()] + job_list = [] + for job in jobs: + processed_job = self._process_job(job["job"]) + if processed_job: + job_list.append(processed_job) + return job_list, new_cursor def _build_filters(self): @@ -212,7 +221,7 @@ class IndeedScraper(Scraper): employer_details = employer.get("employerDetails", {}) if employer else {} rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None return JobPost( - id=str(job["key"]), + id=f'in-{job["key"]}', title=job["title"], description=description, company_name=job["employer"].get("name") if job.get("employer") else None, @@ -251,18 +260,11 @@ class IndeedScraper(Scraper): company_num_employees=employer_details.get("employeesLocalizedLabel"), company_revenue=employer_details.get("revenueLocalizedLabel"), company_description=employer_details.get("briefDescription"), - ceo_name=employer_details.get("ceoName"), - ceo_photo_url=employer_details.get("ceoPhotoUrl"), logo_photo_url=( employer["images"].get("squareLogoUrl") if employer and employer.get("images") else None ), - banner_photo_url=( - employer["images"].get("headerImageUrl") - if employer and employer.get("images") - else None - ), ) @staticmethod @@ -347,112 +349,3 @@ class IndeedScraper(Scraper): return CompensationInterval[mapped_interval] else: raise ValueError(f"Unsupported interval: {interval}") - - api_headers = { - "Host": "apis.indeed.com", - "content-type": "application/json", - "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8", - "accept": "application/json", - "indeed-locale": "en-US", - "accept-language": "en-US,en;q=0.9", - "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1", - "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone", - } - job_search_query = """ - query GetJobData {{ - jobSearch( - {what} - {location} - limit: 100 - {cursor} - sort: RELEVANCE - {filters} - ) {{ - pageInfo {{ - nextCursor - }} - results {{ - trackingKey - job {{ - source {{ - name - }} - key - title - datePublished - dateOnIndeed - description {{ - html - }} - location {{ - countryName - countryCode - admin1Code - city - postalCode - streetAddress - formatted {{ - short - long - }} - }} - compensation {{ - estimated {{ - currencyCode - baseSalary {{ - unitOfWork - range {{ - ... on Range {{ - min - max - }} - }} - }} - }} - baseSalary {{ - unitOfWork - range {{ - ... on Range {{ - min - max - }} - }} - }} - currencyCode - }} - attributes {{ - key - label - }} - employer {{ - relativeCompanyPageUrl - name - dossier {{ - employerDetails {{ - addresses - industry - employeesLocalizedLabel - revenueLocalizedLabel - briefDescription - ceoName - ceoPhotoUrl - }} - images {{ - headerImageUrl - squareLogoUrl - }} - links {{ - corporateWebsite - }} - }} - }} - recruit {{ - viewJobUrl - detailedSalary - workSchedule - }} - }} - }} - }} - }} - """ diff --git a/src/jobspy/scrapers/indeed/constants.py b/src/jobspy/scrapers/indeed/constants.py new file mode 100644 index 0000000..3ff18ca --- /dev/null +++ b/src/jobspy/scrapers/indeed/constants.py @@ -0,0 +1,109 @@ +job_search_query = """ + query GetJobData {{ + jobSearch( + {what} + {location} + limit: 100 + {cursor} + sort: RELEVANCE + {filters} + ) {{ + pageInfo {{ + nextCursor + }} + results {{ + trackingKey + job {{ + source {{ + name + }} + key + title + datePublished + dateOnIndeed + description {{ + html + }} + location {{ + countryName + countryCode + admin1Code + city + postalCode + streetAddress + formatted {{ + short + long + }} + }} + compensation {{ + estimated {{ + currencyCode + baseSalary {{ + unitOfWork + range {{ + ... on Range {{ + min + max + }} + }} + }} + }} + baseSalary {{ + unitOfWork + range {{ + ... on Range {{ + min + max + }} + }} + }} + currencyCode + }} + attributes {{ + key + label + }} + employer {{ + relativeCompanyPageUrl + name + dossier {{ + employerDetails {{ + addresses + industry + employeesLocalizedLabel + revenueLocalizedLabel + briefDescription + ceoName + ceoPhotoUrl + }} + images {{ + headerImageUrl + squareLogoUrl + }} + links {{ + corporateWebsite + }} + }} + }} + recruit {{ + viewJobUrl + detailedSalary + workSchedule + }} + }} + }} + }} + }} + """ + +api_headers = { + "Host": "apis.indeed.com", + "content-type": "application/json", + "indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8", + "accept": "application/json", + "indeed-locale": "en-US", + "accept-language": "en-US,en;q=0.9", + "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1", + "indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone", +} diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 6931b09..f6bc63b 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -7,6 +7,7 @@ This module contains routines to scrape LinkedIn. from __future__ import annotations +import math import time import random import regex as re @@ -17,9 +18,10 @@ from bs4.element import Tag from bs4 import BeautifulSoup from urllib.parse import urlparse, urlunparse, unquote +from .constants import headers from .. import Scraper, ScraperInput, Site from ..exceptions import LinkedInException -from ..utils import create_session, remove_attributes +from ..utils import create_session, remove_attributes, create_logger from ...jobs import ( JobPost, Location, @@ -30,13 +32,14 @@ from ...jobs import ( DescriptionFormat, ) from ..utils import ( - logger, extract_emails_from_text, get_enum_from_job_type, currency_parser, markdown_converter, ) +logger = create_logger("LinkedIn") + class LinkedInScraper(Scraper): base_url = "https://www.linkedin.com" @@ -44,7 +47,9 @@ class LinkedInScraper(Scraper): band_delay = 4 jobs_per_page = 25 - def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): + def __init__( + self, proxies: list[str] | str | None = None, ca_cert: str | None = None + ): """ Initializes LinkedInScraper with the LinkedIn job search url """ @@ -57,7 +62,7 @@ class LinkedInScraper(Scraper): delay=5, clear_cookies=True, ) - self.session.headers.update(self.headers) + self.session.headers.update(headers) self.scraper_input = None self.country = "worldwide" self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+') @@ -71,17 +76,19 @@ class LinkedInScraper(Scraper): self.scraper_input = scraper_input job_list: list[JobPost] = [] seen_ids = set() - page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0 + start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0 request_count = 0 seconds_old = ( scraper_input.hours_old * 3600 if scraper_input.hours_old else None ) continue_search = ( - lambda: len(job_list) < scraper_input.results_wanted and page < 1000 + lambda: len(job_list) < scraper_input.results_wanted and start < 1000 ) while continue_search(): request_count += 1 - logger.info(f"LinkedIn search page: {request_count}") + logger.info( + f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}" + ) params = { "keywords": scraper_input.search_term, "location": scraper_input.location, @@ -93,7 +100,7 @@ class LinkedInScraper(Scraper): else None ), "pageNum": 0, - "start": page, + "start": start, "f_AL": "true" if scraper_input.easy_apply else None, "f_C": ( ",".join(map(str, scraper_input.linkedin_company_ids)) @@ -155,7 +162,7 @@ class LinkedInScraper(Scraper): if continue_search(): time.sleep(random.uniform(self.delay, self.delay + self.band_delay)) - page += len(job_list) + start += len(job_list) job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) @@ -211,7 +218,7 @@ class LinkedInScraper(Scraper): job_details = self._get_job_details(job_id) return JobPost( - id=job_id, + id=f"li-{job_id}", title=title, company_name=company, company_url=company_url, @@ -267,15 +274,19 @@ class LinkedInScraper(Scraper): ) if job_function_span: job_function = job_function_span.text.strip() + + logo_photo_url = ( + logo_image.get("data-delayed-url") + if (logo_image := soup.find("img", {"class": "artdeco-entity-image"})) + else None + ) return { "description": description, "job_level": self._parse_job_level(soup), "company_industry": self._parse_company_industry(soup), "job_type": self._parse_job_type(soup), "job_url_direct": self._parse_job_url_direct(soup), - "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get( - "data-delayed-url" - ), + "logo_photo_url": logo_photo_url, "job_function": job_function, } @@ -402,12 +413,3 @@ class LinkedInScraper(Scraper): JobType.CONTRACT: "C", JobType.TEMPORARY: "T", }.get(job_type_enum, "") - - headers = { - "authority": "www.linkedin.com", - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "accept-language": "en-US,en;q=0.9", - "cache-control": "max-age=0", - "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - } diff --git a/src/jobspy/scrapers/linkedin/constants.py b/src/jobspy/scrapers/linkedin/constants.py new file mode 100644 index 0000000..6123058 --- /dev/null +++ b/src/jobspy/scrapers/linkedin/constants.py @@ -0,0 +1,8 @@ +headers = { + "authority": "www.linkedin.com", + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "en-US,en;q=0.9", + "cache-control": "max-age=0", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", +} diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 8ccd404..760d52c 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -12,15 +12,18 @@ from requests.adapters import HTTPAdapter, Retry from ..jobs import CompensationInterval, JobType -logger = logging.getLogger("JobSpy") -logger.propagate = False -if not logger.handlers: - logger.setLevel(logging.INFO) - console_handler = logging.StreamHandler() - format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - formatter = logging.Formatter(format) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) + +def create_logger(name: str): + logger = logging.getLogger(f"JobSpy:{name}") + logger.propagate = False + if not logger.handlers: + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler() + format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s" + formatter = logging.Formatter(format) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + return logger class RotatingProxySession: @@ -138,7 +141,9 @@ def set_logger_level(verbose: int = 2): level_name = {2: "INFO", 1: "WARNING", 0: "ERROR"}.get(verbose, "INFO") level = getattr(logging, level_name.upper(), None) if level is not None: - logger.setLevel(level) + for logger_name in logging.root.manager.loggerDict: + if logger_name.startswith("JobSpy:"): + logging.getLogger(logger_name).setLevel(level) else: raise ValueError(f"Invalid log level: {level_name}") @@ -199,6 +204,10 @@ def extract_salary( monthly_threshold=30000, enforce_annual_salary=False, ): + """ + Extracts salary information from a string and returns the salary interval, min and max salary values, and currency. + (TODO: Needs test cases as the regex is complicated and may not cover all edge cases) + """ if not salary_str: return None, None, None, None diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index b5c0221..294ca8c 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -18,13 +18,14 @@ from concurrent.futures import ThreadPoolExecutor from bs4 import BeautifulSoup +from .constants import headers from .. import Scraper, ScraperInput, Site from ..utils import ( - logger, extract_emails_from_text, create_session, markdown_converter, remove_attributes, + create_logger, ) from ...jobs import ( JobPost, @@ -36,12 +37,16 @@ from ...jobs import ( DescriptionFormat, ) +logger = create_logger("ZipRecruiter") + class ZipRecruiterScraper(Scraper): base_url = "https://www.ziprecruiter.com" api_url = "https://api.ziprecruiter.com" - def __init__(self, proxies: list[str] | str | None = None, ca_cert: str | None = None): + def __init__( + self, proxies: list[str] | str | None = None, ca_cert: str | None = None + ): """ Initializes ZipRecruiterScraper with the ZipRecruiter job search url """ @@ -49,6 +54,7 @@ class ZipRecruiterScraper(Scraper): self.scraper_input = None self.session = create_session(proxies=proxies, ca_cert=ca_cert) + self.session.headers.update(headers) self._get_cookies() self.delay = 5 @@ -71,7 +77,7 @@ class ZipRecruiterScraper(Scraper): break if page > 1: time.sleep(self.delay) - logger.info(f"ZipRecruiter search page: {page}") + logger.info(f"search page: {page} / {max_pages}") jobs_on_page, continue_token = self._find_jobs_in_page( scraper_input, continue_token ) @@ -97,9 +103,7 @@ class ZipRecruiterScraper(Scraper): if continue_token: params["continue_from"] = continue_token try: - res = self.session.get( - f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params - ) + res = self.session.get(f"{self.api_url}/jobs-app/jobs", params=params) if res.status_code not in range(200, 400): if res.status_code == 429: err = "429 Response - Blocked by ZipRecruiter for too many requests" @@ -160,7 +164,7 @@ class ZipRecruiterScraper(Scraper): description_full, job_url_direct = self._get_descr(job_url) return JobPost( - id=str(job["listing_key"]), + id=f'zr-{job["listing_key"]}', title=title, company_name=company, location=location, @@ -180,7 +184,7 @@ class ZipRecruiterScraper(Scraper): ) def _get_descr(self, job_url): - res = self.session.get(job_url, headers=self.headers, allow_redirects=True) + res = self.session.get(job_url, allow_redirects=True) description_full = job_url_direct = None if res.ok: soup = BeautifulSoup(res.text, "html.parser") @@ -213,7 +217,7 @@ class ZipRecruiterScraper(Scraper): def _get_cookies(self): data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" url = f"{self.api_url}/jobs-app/event" - self.session.post(url, data=data, headers=self.headers) + self.session.post(url, data=data) @staticmethod def _get_job_type_enum(job_type_str: str) -> list[JobType] | None: @@ -241,14 +245,3 @@ class ZipRecruiterScraper(Scraper): if scraper_input.distance: params["radius"] = scraper_input.distance return {k: v for k, v in params.items() if v is not None} - - headers = { - "Host": "api.ziprecruiter.com", - "accept": "*/*", - "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", - "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", - "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", - "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", - "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", - "accept-language": "en-US,en;q=0.9", - } diff --git a/src/jobspy/scrapers/ziprecruiter/constants.py b/src/jobspy/scrapers/ziprecruiter/constants.py new file mode 100644 index 0000000..7e179c9 --- /dev/null +++ b/src/jobspy/scrapers/ziprecruiter/constants.py @@ -0,0 +1,10 @@ +headers = { + "Host": "api.ziprecruiter.com", + "accept": "*/*", + "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", + "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", + "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", + "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", + "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", + "accept-language": "en-US,en;q=0.9", +} diff --git a/tests/test_all.py b/tests/test_all.py index c6b0131..3285611 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -3,16 +3,16 @@ import pandas as pd def test_all(): + sites = [ + "indeed", + "glassdoor", + ] # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci result = scrape_jobs( - site_name=[ - "linkedin", - "indeed", - "glassdoor", - ], # ziprecruiter needs good ip, and temp fix to pass test on ci + site_name=sites, search_term="engineer", results_wanted=5, ) assert ( - isinstance(result, pd.DataFrame) and len(result) == 15 + isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5 ), "Result should be a non-empty DataFrame"