diff --git a/README.md b/README.md index 8ae668f..c17377d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ work with us.* - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously - Aggregates the job postings in a Pandas DataFrame -- Proxy support (HTTP/S, SOCKS) +- Proxy support [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - Updated for release v1.1.3 @@ -67,12 +67,13 @@ Optional ├── location (str) ├── distance (int): in miles ├── job_type (enum): fulltime, parttime, internship, contract -├── proxy (str): in format 'http://user:pass@host:port' or [https, socks] +├── proxy (str): in format 'http://user:pass@host:port' ├── is_remote (bool) -├── full_description (bool): fetches full description for LinkedIn (slower) +├── linkedin_fetch_description (bool): fetches full description for LinkedIn (slower) ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── easy_apply (bool): filters for jobs that are hosted on the job board site ├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids +├── description_format (enum): markdown, html (format type of the job descriptions) ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling) ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result) ├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day) diff --git a/poetry.lock b/poetry.lock index d573844..d4581f9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -524,6 +524,17 @@ files = [ {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, ] +[[package]] +name = "html2text" +version = "2020.1.16" +description = "Turn HTML into equivalent Markdown-structured text."
+optional = false +python-versions = ">=3.5" +files = [ + {file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"}, + {file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"}, +] + [[package]] name = "idna" version = "3.4" @@ -2435,4 +2446,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "404a77d78066cbb2ef71015562baf44aa11d12aac29a191c1ccc7758bfda598a" +content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0" diff --git a/pyproject.toml b/pyproject.toml index e939358..8fd7ba7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.44" +version = "1.1.45" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" @@ -18,6 +18,7 @@ beautifulsoup4 = "^4.12.2" pandas = "^2.1.0" NUMPY = "1.24.2" pydantic = "^2.3.0" +html2text = "^2020.1.16" [tool.poetry.group.dev.dependencies] diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 7370c37..c4c87d9 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -15,17 +15,6 @@ from .scrapers.exceptions import ( GlassdoorException, ) -SCRAPER_MAPPING = { - Site.LINKEDIN: LinkedInScraper, - Site.INDEED: IndeedScraper, - Site.ZIP_RECRUITER: ZipRecruiterScraper, - Site.GLASSDOOR: GlassdoorScraper, -} - - -def _map_str_to_site(site_name: str) -> Site: - return Site[site_name.upper()] - def scrape_jobs( site_name: str | list[str] | Site | list[Site] | None = None, @@ -39,7 +28,8 @@ def scrape_jobs( country_indeed: str = "usa", hyperlinks: bool = False, proxy: str | None = None, - full_description: bool | None = False, + description_format: str = "markdown", + linkedin_fetch_description: bool | None = False, linkedin_company_ids: list[int] | None = None, offset: int | None = 0, hours_old: int = None, @@ -49,6 +39,15 @@ def scrape_jobs( Simultaneously scrapes job data from multiple job sites. 
:return: results_wanted: pandas dataframe containing job data """ + SCRAPER_MAPPING = { + Site.LINKEDIN: LinkedInScraper, + Site.INDEED: IndeedScraper, + Site.ZIP_RECRUITER: ZipRecruiterScraper, + Site.GLASSDOOR: GlassdoorScraper, + } + + def map_str_to_site(site_name: str) -> Site: + return Site[site_name.upper()] def get_enum_from_value(value_str): for job_type in JobType: @@ -61,16 +60,15 @@ def scrape_jobs( def get_site_type(): site_types = list(Site) if isinstance(site_name, str): - site_types = [_map_str_to_site(site_name)] + site_types = [map_str_to_site(site_name)] elif isinstance(site_name, Site): site_types = [site_name] elif isinstance(site_name, list): site_types = [ - _map_str_to_site(site) if isinstance(site, str) else site + map_str_to_site(site) if isinstance(site, str) else site for site in site_name ] return site_types - country_enum = Country.from_string(country_indeed) scraper_input = ScraperInput( @@ -82,7 +80,8 @@ def scrape_jobs( is_remote=is_remote, job_type=job_type, easy_apply=easy_apply, - full_description=full_description, + description_format=description_format, + linkedin_fetch_description=linkedin_fetch_description, results_wanted=results_wanted, linkedin_company_ids=linkedin_company_ids, offset=offset, @@ -92,22 +91,7 @@ def scrape_jobs( def scrape_site(site: Site) -> Tuple[str, JobResponse]: scraper_class = SCRAPER_MAPPING[site] scraper = scraper_class(proxy=proxy) - - try: - scraped_data: JobResponse = scraper.scrape(scraper_input) - except (LinkedInException, IndeedException, ZipRecruiterException) as lie: - raise lie - except Exception as e: - if site == Site.LINKEDIN: - raise LinkedInException(str(e)) - if site == Site.INDEED: - raise IndeedException(str(e)) - if site == Site.ZIP_RECRUITER: - raise ZipRecruiterException(str(e)) - if site == Site.GLASSDOOR: - raise GlassdoorException(str(e)) - else: - raise e + scraped_data: JobResponse = scraper.scrape(scraper_input) return site.value, scraped_data site_to_jobs_dict = {} @@ -188,8 +172,6 @@ def scrape_jobs( "emails", "description", ] - jobs_formatted_df = jobs_df[desired_order] + return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False]) else: - jobs_formatted_df = pd.DataFrame() - - return jobs_formatted_df.sort_values(by=['site', 'date_posted'], ascending=[True, False]) + return pd.DataFrame() diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index a819897..c4bbb43 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -210,6 +210,11 @@ class Compensation(BaseModel): currency: Optional[str] = "USD" +class DescriptionFormat(Enum): + MARKDOWN = "markdown" + HTML = "html" + + class JobPost(BaseModel): title: str company_name: str diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index f180d0d..0c14252 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -1,4 +1,11 @@ -from ..jobs import Enum, BaseModel, JobType, JobResponse, Country +from ..jobs import ( + Enum, + BaseModel, + JobType, + JobResponse, + Country, + DescriptionFormat +) class Site(Enum): @@ -18,9 +25,10 @@ class ScraperInput(BaseModel): is_remote: bool = False job_type: JobType | None = None easy_apply: bool | None = None - full_description: bool = False offset: int = 0 + linkedin_fetch_description: bool = False linkedin_company_ids: list[int] | None = None + description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN results_wanted: int = 15 hours_old: int | None = None diff --git 
a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 3352a2a..9bffd16 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -13,7 +13,11 @@ from ..utils import count_urgent_words, extract_emails_from_text from .. import Scraper, ScraperInput, Site from ..exceptions import GlassdoorException -from ..utils import create_session +from ..utils import ( + create_session, + markdown_converter, + logger +) from ...jobs import ( JobPost, Compensation, @@ -21,6 +25,7 @@ from ...jobs import ( Location, JobResponse, JobType, + DescriptionFormat ) @@ -32,13 +37,57 @@ class GlassdoorScraper(Scraper): site = Site(Site.GLASSDOOR) super().__init__(site, proxy=proxy) - self.url = None + self.base_url = None self.country = None self.session = None + self.scraper_input = None self.jobs_per_page = 30 self.seen_urls = set() - def fetch_jobs_page( + def scrape(self, scraper_input: ScraperInput) -> JobResponse: + """ + Scrapes Glassdoor for jobs with scraper_input criteria. + :param scraper_input: Information about job search criteria. + :return: JobResponse containing a list of jobs. + """ + self.scraper_input = scraper_input + self.scraper_input.results_wanted = min(900, scraper_input.results_wanted) + self.base_url = self.scraper_input.country.get_url() + + location_id, location_type = self._get_location( + scraper_input.location, scraper_input.is_remote + ) + if location_type is None: + return JobResponse(jobs=[]) + all_jobs: list[JobPost] = [] + cursor = None + max_pages = 30 + self.session = create_session(self.proxy, is_tls=False, has_retry=True) + self.session.get(self.base_url) + + try: + for page in range( + 1 + (scraper_input.offset // self.jobs_per_page), + min( + (scraper_input.results_wanted // self.jobs_per_page) + 2, + max_pages + 1, + ), + ): + try: + jobs, cursor = self._fetch_jobs_page( + scraper_input, location_id, location_type, page, cursor + ) + all_jobs.extend(jobs) + if len(all_jobs) >= scraper_input.results_wanted: + all_jobs = all_jobs[: scraper_input.results_wanted] + break + except Exception as e: + raise GlassdoorException(str(e)) + except Exception as e: + raise GlassdoorException(str(e)) + return JobResponse(jobs=all_jobs) + + def _fetch_jobs_page( self, scraper_input: ScraperInput, location_id: int, @@ -49,12 +98,13 @@ class GlassdoorScraper(Scraper): """ Scrapes a page of Glassdoor for jobs with scraper_input criteria """ + self.scraper_input = scraper_input try: - payload = self.add_payload( - scraper_input, location_id, location_type, page_num, cursor + payload = self._add_payload( + location_id, location_type, page_num, cursor ) response = self.session.post( - f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload + f"{self.base_url}/graph", headers=self.headers, timeout=10, data=payload ) if response.status_code != 200: raise GlassdoorException( @@ -70,7 +120,7 @@ class GlassdoorScraper(Scraper): jobs = [] with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: - future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data} + future_to_job_data = {executor.submit(self._process_job, job): job for job in jobs_data} for future in as_completed(future_to_job_data): try: job_post = future.result() @@ -83,10 +133,12 @@ class GlassdoorScraper(Scraper): res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 ) - def process_job(self, job_data): - """Processes a single job and fetches its description.""" + def _process_job(self, 
job_data): + """ + Processes a single job and fetches its description. + """ job_id = job_data["jobview"]["job"]["listingId"] - job_url = f'{self.url}job-listing/j?jl={job_id}' + job_url = f'{self.base_url}job-listing/j?jl={job_id}' if job_url in self.seen_urls: return None self.seen_urls.add(job_url) @@ -106,15 +158,13 @@ class GlassdoorScraper(Scraper): location = self.parse_location(location_name) compensation = self.parse_compensation(job["header"]) - try: - description = self.fetch_job_description(job_id) + description = self._fetch_job_description(job_id) except: description = None - - job_post = JobPost( + return JobPost( title=title, - company_url=f"{self.url}Overview/W-EI_IE{company_id}.htm" if company_id else None, + company_url=f"{self.base_url}Overview/W-EI_IE{company_id}.htm" if company_id else None, company_name=company_name, date_posted=date_posted, job_url=job_url, @@ -125,53 +175,12 @@ class GlassdoorScraper(Scraper): emails=extract_emails_from_text(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None, ) - return job_post - def scrape(self, scraper_input: ScraperInput) -> JobResponse: + def _fetch_job_description(self, job_id): """ - Scrapes Glassdoor for jobs with scraper_input criteria. - :param scraper_input: Information about job search criteria. - :return: JobResponse containing a list of jobs. + Fetches the job description for a single job ID. """ - scraper_input.results_wanted = min(900, scraper_input.results_wanted) - self.country = scraper_input.country - self.url = self.country.get_url() - - location_id, location_type = self.get_location( - scraper_input.location, scraper_input.is_remote - ) - all_jobs: list[JobPost] = [] - cursor = None - max_pages = 30 - self.session = create_session(self.proxy, is_tls=False, has_retry=True) - self.session.get(self.url) - - try: - for page in range( - 1 + (scraper_input.offset // self.jobs_per_page), - min( - (scraper_input.results_wanted // self.jobs_per_page) + 2, - max_pages + 1, - ), - ): - try: - jobs, cursor = self.fetch_jobs_page( - scraper_input, location_id, location_type, page, cursor - ) - all_jobs.extend(jobs) - if len(all_jobs) >= scraper_input.results_wanted: - all_jobs = all_jobs[: scraper_input.results_wanted] - break - except Exception as e: - raise GlassdoorException(str(e)) - except Exception as e: - raise GlassdoorException(str(e)) - - return JobResponse(jobs=all_jobs) - - def fetch_job_description(self, job_id): - """Fetches the job description for a single job ID.""" - url = f"{self.url}/graph" + url = f"{self.base_url}/graph" body = [ { "operationName": "JobDetailQuery", @@ -196,48 +205,28 @@ class GlassdoorScraper(Scraper): """ } ] - response = requests.post(url, json=body, headers=GlassdoorScraper.headers()) - if response.status_code != 200: + res = requests.post(url, json=body, headers=self.headers) + if res.status_code != 200: return None - data = response.json()[0] + data = res.json()[0] desc = data['data']['jobview']['job']['description'] - return desc + return markdown_converter(desc) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else desc - @staticmethod - def parse_compensation(data: dict) -> Optional[Compensation]: - pay_period = data.get("payPeriod") - adjusted_pay = data.get("payPeriodAdjustedPay") - currency = data.get("payCurrency", "USD") - - if not pay_period or not adjusted_pay: - return None - - interval = None - if pay_period == "ANNUAL": - interval = CompensationInterval.YEARLY - elif pay_period: - 
interval = CompensationInterval.get_interval(pay_period) - min_amount = int(adjusted_pay.get("p10") // 1) - max_amount = int(adjusted_pay.get("p90") // 1) - - return Compensation( - interval=interval, - min_amount=min_amount, - max_amount=max_amount, - currency=currency, - ) - - def get_location(self, location: str, is_remote: bool) -> (int, str): + def _get_location(self, location: str, is_remote: bool) -> (int, str): if not location or is_remote: return "11047", "STATE" # remote options - url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" + url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" session = create_session(self.proxy, has_retry=True) - response = session.get(url) - if response.status_code != 200: - raise GlassdoorException( - f"bad response status code: {response.status_code}" - ) - items = response.json() + res = session.get(url) + if res.status_code != 200: + if res.status_code == 429: + logger.error(f'429 Response - Blocked by Glassdoor for too many requests') + return None, None + else: + logger.error(f'Glassdoor response status code {res.status_code}') + return None, None + items = res.json() + if not items: raise ValueError(f"Location '{location}' not found on Glassdoor") location_type = items[0]["locationType"] @@ -249,18 +238,16 @@ class GlassdoorScraper(Scraper): location_type = "COUNTRY" return int(items[0]["locationId"]), location_type - @staticmethod - def add_payload( - scraper_input, + def _add_payload( + self, location_id: int, location_type: str, page_num: int, cursor: str | None = None, ) -> str: - # `fromage` is the posting time filter in days - fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None + fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None filter_params = [] - if scraper_input.easy_apply: + if self.scraper_input.easy_apply: filter_params.append({"filterKey": "applicationType", "values": "1"}) if fromage: filter_params.append({"filterKey": "fromAge", "values": str(fromage)}) @@ -269,7 +256,7 @@ class GlassdoorScraper(Scraper): "variables": { "excludeJobListingIds": [], "filterParams": filter_params, - "keyword": scraper_input.search_term, + "keyword": self.scraper_input.search_term, "numJobsToShow": 30, "locationType": location_type, "locationId": int(location_id), @@ -446,13 +433,34 @@ class GlassdoorScraper(Scraper): } """ } - - if scraper_input.job_type: + if self.scraper_input.job_type: payload["variables"]["filterParams"].append( - {"filterKey": "jobType", "values": scraper_input.job_type.value[0]} + {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]} ) return json.dumps([payload]) + @staticmethod + def parse_compensation(data: dict) -> Optional[Compensation]: + pay_period = data.get("payPeriod") + adjusted_pay = data.get("payPeriodAdjustedPay") + currency = data.get("payCurrency", "USD") + if not pay_period or not adjusted_pay: + return None + + interval = None + if pay_period == "ANNUAL": + interval = CompensationInterval.YEARLY + elif pay_period: + interval = CompensationInterval.get_interval(pay_period) + min_amount = int(adjusted_pay.get("p10") // 1) + max_amount = int(adjusted_pay.get("p90") // 1) + return Compensation( + interval=interval, + min_amount=min_amount, + max_amount=max_amount, + currency=currency, + ) + @staticmethod def get_job_type_enum(job_type_str: str) -> list[JobType] | None: for job_type in JobType: @@ -472,27 +480,21 @@ class 
GlassdoorScraper(Scraper): if cursor_data["pageNumber"] == page_num: return cursor_data["cursor"] - @staticmethod - def headers() -> dict: - """ - Returns headers needed for requests - :return: dict - Dictionary containing headers - """ - return { - "authority": "www.glassdoor.com", - "accept": "*/*", - "accept-language": "en-US,en;q=0.9", - "apollographql-client-name": "job-search-next", - "apollographql-client-version": "4.65.5", - "content-type": "application/json", - "gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok", - "origin": "https://www.glassdoor.com", - "referer": "https://www.glassdoor.com/", - "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', - "sec-ch-ua-mobile": "?0", - "sec-ch-ua-platform": '"macOS"', - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", - "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", - } + headers = { + "authority": "www.glassdoor.com", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9", + "apollographql-client-name": "job-search-next", + "apollographql-client-version": "4.65.5", + "content-type": "application/json", + "gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok", + "origin": "https://www.glassdoor.com", + "referer": "https://www.glassdoor.com/", + "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"macOS"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", + } diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 5b05cbd..27c3d34 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -21,6 +21,7 @@ from ..utils import ( extract_emails_from_text, create_session, get_enum_from_job_type, + markdown_converter, logger ) from ...jobs import ( @@ -30,6 +31,7 @@ from ...jobs import ( Location, JobResponse, JobType, + DescriptionFormat ) from .. 
import Scraper, ScraperInput, Site @@ -39,121 +41,23 @@ class IndeedScraper(Scraper): """ Initializes IndeedScraper with the Indeed job search url """ - self.url = None - self.country = None + self.scraper_input = None + self.jobs_per_page = 25 + self.num_workers = 10 + self.seen_urls = set() + self.base_url = None + self.api_url = "https://apis.indeed.com/graphql" site = Site(Site.INDEED) super().__init__(site, proxy=proxy) - self.jobs_per_page = 25 - self.seen_urls = set() - - def scrape_page( - self, scraper_input: ScraperInput, page: int - ) -> list[JobPost]: - """ - Scrapes a page of Indeed for jobs with scraper_input criteria - :param scraper_input: - :param page: - :return: jobs found on page, total number of jobs found for search - """ - job_list = [] - self.country = scraper_input.country - domain = self.country.indeed_domain_value - self.url = f"https://{domain}.indeed.com" - - try: - session = create_session(self.proxy) - response = session.get( - f"{self.url}/m/jobs", - headers=self.get_headers(), - params=self.add_params(scraper_input, page), - allow_redirects=True, - timeout_seconds=10, - ) - if response.status_code not in range(200, 400): - raise IndeedException( - f"bad response with status code: {response.status_code}" - ) - except Exception as e: - if "Proxy responded with" in str(e): - logger.error(f'Indeed: Bad proxy') - else: - logger.error(f'Indeed: {str(e)}') - return job_list - - soup = BeautifulSoup(response.content, "html.parser") - if "did not match any jobs" in response.text: - return job_list - - jobs = IndeedScraper.parse_jobs( - soup - ) #: can raise exception, handled by main scrape function - - if ( - not jobs.get("metaData", {}) - .get("mosaicProviderJobCardsModel", {}) - .get("results") - ): - raise IndeedException("No jobs found.") - - def process_job(job: dict, job_detailed: dict) -> JobPost | None: - job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}' - job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' - if job_url in self.seen_urls: - return None - self.seen_urls.add(job_url) - description = job_detailed['description']['html'] - - - job_type = IndeedScraper.get_job_type(job) - timestamp_seconds = job["pubDate"] / 1000 - date_posted = datetime.fromtimestamp(timestamp_seconds) - date_posted = date_posted.strftime("%Y-%m-%d") - - job_post = JobPost( - title=job["normTitle"], - description=description, - company_name=job["company"], - company_url=f"{self.url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed['employer'] else None, - location=Location( - city=job.get("jobLocationCity"), - state=job.get("jobLocationState"), - country=self.country, - ), - job_type=job_type, - compensation=self.get_compensation(job, job_detailed), - date_posted=date_posted, - job_url=job_url_client, - emails=extract_emails_from_text(description) if description else None, - num_urgent_words=count_urgent_words(description) - if description - else None, - is_remote=IndeedScraper.is_job_remote(job, job_detailed, description) - - ) - return job_post - - workers = 10 - jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] - job_keys = [job['jobkey'] for job in jobs] - jobs_detailed = self.get_job_details(job_keys) - - with ThreadPoolExecutor(max_workers=workers) as executor: - job_results: list[Future] = [ - executor.submit(process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed) - ] - - job_list = [result.result() for result in job_results if result.result()] - - return job_list - def scrape(self, 
scraper_input: ScraperInput) -> JobResponse: """ Scrapes Indeed for jobs with scraper_input criteria :param scraper_input: :return: job_response """ - job_list = self.scrape_page(scraper_input, 0) + self.scraper_input = scraper_input + job_list = self._scrape_page() pages_processed = 1 while len(self.seen_urls) < scraper_input.results_wanted: @@ -162,7 +66,7 @@ class IndeedScraper(Scraper): with ThreadPoolExecutor(max_workers=10) as executor: futures: list[Future] = [ - executor.submit(self.scrape_page, scraper_input, page + pages_processed) + executor.submit(self._scrape_page, page + pages_processed) for page in range(pages_to_process) ] @@ -184,8 +88,136 @@ class IndeedScraper(Scraper): return JobResponse(jobs=job_list) + def _scrape_page(self, page: int=0) -> list[JobPost]: + """ + Scrapes a page of Indeed for jobs with scraper_input criteria + :param page: + :return: jobs found on page, total number of jobs found for search + """ + job_list = [] + domain = self.scraper_input.country.indeed_domain_value + self.base_url = f"https://{domain}.indeed.com" + + try: + session = create_session(self.proxy) + response = session.get( + f"{self.base_url}/m/jobs", + headers=self.headers, + params=self._add_params(page), + ) + if response.status_code not in range(200, 400): + if response.status_code == 429: + logger.error(f'429 Response - Blocked by Indeed for too many requests') + else: + logger.error(f'Indeed response status code {response.status_code}') + return job_list + + except Exception as e: + if "Proxy responded with" in str(e): + logger.error(f'Indeed: Bad proxy') + else: + logger.error(f'Indeed: {str(e)}') + return job_list + + soup = BeautifulSoup(response.content, "html.parser") + if "did not match any jobs" in response.text: + return job_list + + jobs = IndeedScraper._parse_jobs(soup) + if ( + not jobs.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results") + ): + raise IndeedException("No jobs found.") + + jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] + job_keys = [job['jobkey'] for job in jobs] + jobs_detailed = self._get_job_details(job_keys) + + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + job_results: list[Future] = [ + executor.submit(self._process_job, job, job_detailed['job']) for job, job_detailed in zip(jobs, jobs_detailed) + ] + + job_list = [result.result() for result in job_results if result.result()] + + return job_list + + def _process_job(self, job: dict, job_detailed: dict) -> JobPost | None: + job_url = f'{self.base_url}/m/jobs/viewjob?jk={job["jobkey"]}' + job_url_client = f'{self.base_url}/viewjob?jk={job["jobkey"]}' + if job_url in self.seen_urls: + return None + self.seen_urls.add(job_url) + description = job_detailed['description']['html'] + description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description + job_type = self._get_job_type(job) + timestamp_seconds = job["pubDate"] / 1000 + date_posted = datetime.fromtimestamp(timestamp_seconds) + date_posted = date_posted.strftime("%Y-%m-%d") + return JobPost( + title=job["normTitle"], + description=description, + company_name=job["company"], + company_url=f"{self.base_url}{job_detailed['employer']['relativeCompanyPageUrl']}" if job_detailed[ + 'employer'] else None, + location=Location( + city=job.get("jobLocationCity"), + state=job.get("jobLocationState"), + country=self.scraper_input.country, + ), + job_type=job_type, + compensation=self._get_compensation(job, job_detailed), + 
date_posted=date_posted, + job_url=job_url_client, + emails=extract_emails_from_text(description) if description else None, + num_urgent_words=count_urgent_words(description) if description else None, + is_remote=self._is_job_remote(job, job_detailed, description) + ) + + def _get_job_details(self, job_keys: list[str]) -> dict: + """ + Queries the GraphQL endpoint for detailed job information for the given job keys. + """ + job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']' + payload = dict(self.api_payload) + payload["query"] = self.api_payload["query"].format(job_keys_gql=job_keys_gql) + response = requests.post(self.api_url, headers=self.api_headers, json=payload, proxies=self.proxy) + if response.status_code == 200: + return response.json()['data']['jobData']['results'] + else: + return {} + + def _add_params(self, page: int) -> dict[str, str | Any]: + fromage = max(self.scraper_input.hours_old // 24, 1) if self.scraper_input.hours_old else None + params = { + "q": self.scraper_input.search_term, + "l": self.scraper_input.location if self.scraper_input.location else self.scraper_input.country.value[0].split(',')[-1], + "filter": 0, + "start": self.scraper_input.offset + page * 10, + "sort": "date", + "fromage": fromage, + } + if self.scraper_input.distance: + params["radius"] = self.scraper_input.distance + + sc_values = [] + if self.scraper_input.is_remote: + sc_values.append("attr(DSQF7)") + if self.scraper_input.job_type: + sc_values.append("jt({})".format(self.scraper_input.job_type.value[0])) + + if sc_values: + params["sc"] = "0kf:" + "".join(sc_values) + ";" + + if self.scraper_input.easy_apply: + params['iafilter'] = 1 + + return params + @staticmethod - def get_job_type(job: dict) -> list[JobType] | None: + def _get_job_type(job: dict) -> list[JobType] | None: """ Parses the job to get list of job types :param job: @@ -204,7 +236,7 @@ class IndeedScraper(Scraper): return job_types @staticmethod - def get_compensation(job: dict, job_detailed: dict) -> Compensation: + def _get_compensation(job: dict, job_detailed: dict) -> Compensation: """ Parses the job to get :param job: @@ -213,7 +245,7 @@ class IndeedScraper(Scraper): """ comp = job_detailed['compensation']['baseSalary'] if comp: - interval = IndeedScraper.get_correct_interval(comp['unitOfWork']) + interval = IndeedScraper._get_correct_interval(comp['unitOfWork']) if interval: return Compensation( interval=interval, @@ -242,18 +274,13 @@ class IndeedScraper(Scraper): return compensation @staticmethod - def parse_jobs(soup: BeautifulSoup) -> dict: + def _parse_jobs(soup: BeautifulSoup) -> dict: """ Parses the jobs from the soup object :param soup: :return: jobs """ - def find_mosaic_script() -> Tag | None: - """ - Finds jobcards script tag - :return: script_tag - """ script_tags = soup.find_all("script") for tag in script_tags: @@ -266,7 +293,6 @@ class IndeedScraper(Scraper): return None script_tag = find_mosaic_script() - if script_tag: script_str = script_tag.string pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});' @@ -283,49 +309,7 @@ class IndeedScraper(Scraper): ) @staticmethod - def get_headers(): - return { - 'Host': 'www.indeed.com', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'sec-fetch-site': 'same-origin', - 'sec-fetch-dest': 'document', - 'accept-language': 'en-US,en;q=0.9', - 'sec-fetch-mode': 'navigate', - 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) 
Mobile/15E148 Indeed App 192.0', - 'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3', - } - - @staticmethod - def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]: - # `fromage` is the posting time filter in days - fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None - params = { - "q": scraper_input.search_term, - "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1], - "filter": 0, - "start": scraper_input.offset + page * 10, - "sort": "date", - "fromage": fromage, - } - if scraper_input.distance: - params["radius"] = scraper_input.distance - - sc_values = [] - if scraper_input.is_remote: - sc_values.append("attr(DSQF7)") - if scraper_input.job_type: - sc_values.append("jt({})".format(scraper_input.job_type.value[0])) - - if sc_values: - params["sc"] = "0kf:" + "".join(sc_values) + ";" - - if scraper_input.easy_apply: - params['iafilter'] = 1 - - return params - - @staticmethod - def is_job_remote(job: dict, job_detailed: dict, description: str) -> bool: + def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool: remote_keywords = ['remote', 'work from home', 'wfh'] is_remote_in_attributes = any( any(keyword in attr['label'].lower() for keyword in remote_keywords) @@ -342,86 +326,8 @@ class IndeedScraper(Scraper): ) return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy - def get_job_details(self, job_keys: list[str]) -> dict: - """ - Queries the GraphQL endpoint for detailed job information for the given job keys. - """ - url = "https://apis.indeed.com/graphql" - headers = { - 'Host': 'apis.indeed.com', - 'content-type': 'application/json', - 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', - 'accept': 'application/json', - 'indeed-locale': 'en-US', - 'accept-language': 'en-US,en;q=0.9', - 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1', - 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', - 'indeed-co': 'US', - } - - job_keys_gql = '[' + ', '.join(f'"{key}"' for key in job_keys) + ']' - - payload = { - "query": f""" - query GetJobData {{ - jobData(input: {{ - jobKeys: {job_keys_gql} - }}) {{ - results {{ - job {{ - key - title - description {{ - html - }} - location {{ - countryName - countryCode - city - postalCode - streetAddress - formatted {{ - short - long - }} - }} - compensation {{ - baseSalary {{ - unitOfWork - range {{ - ... 
on Range {{ - min - max - }} - }} - }} - currencyCode - }} - attributes {{ - label - }} - employer {{ - relativeCompanyPageUrl - }} - recruit {{ - viewJobUrl - detailedSalary - workSchedule - }} - }} - }} - }} - }} - """ - } - response = requests.post(url, headers=headers, json=payload, proxies=self.proxy) - if response.status_code == 200: - return response.json()['data']['jobData']['results'] - else: - return {} - @staticmethod - def get_correct_interval(interval: str) -> CompensationInterval: + def _get_correct_interval(interval: str) -> CompensationInterval: interval_mapping = { "DAY": "DAILY", "YEAR": "YEARLY", @@ -434,3 +340,78 @@ class IndeedScraper(Scraper): return CompensationInterval[mapped_interval] else: raise ValueError(f"Unsupported interval: {interval}") + + headers = { + 'Host': 'www.indeed.com', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'sec-fetch-site': 'same-origin', + 'sec-fetch-dest': 'document', + 'accept-language': 'en-US,en;q=0.9', + 'sec-fetch-mode': 'navigate', + 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0', + 'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3', + } + api_headers = { + 'Host': 'apis.indeed.com', + 'content-type': 'application/json', + 'indeed-api-key': '161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8', + 'accept': 'application/json', + 'indeed-locale': 'en-US', + 'accept-language': 'en-US,en;q=0.9', + 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1', + 'indeed-app-info': 'appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone', + 'indeed-co': 'US', + } + api_payload = { + "query": """ + query GetJobData {{ + jobData(input: {{ + jobKeys: {job_keys_gql} + }}) {{ + results {{ + job {{ + key + title + description {{ + html + }} + location {{ + countryName + countryCode + city + postalCode + streetAddress + formatted {{ + short + long + }} + }} + compensation {{ + baseSalary {{ + unitOfWork + range {{ + ... 
on Range {{ + min + max + }} + }} + }} + currencyCode + }} + attributes {{ + label + }} + employer {{ + relativeCompanyPageUrl + }} + recruit {{ + viewJobUrl + detailedSalary + workSchedule + }} + }} + }} + }} + }} + """ + } diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 4833337..ad17cd4 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -25,26 +25,30 @@ from ...jobs import ( JobResponse, JobType, Country, - Compensation + Compensation, + DescriptionFormat ) from ..utils import ( + logger, count_urgent_words, extract_emails_from_text, get_enum_from_job_type, - currency_parser + currency_parser, + markdown_converter ) class LinkedInScraper(Scraper): - DELAY = 3 + base_url = "https://www.linkedin.com" + delay = 3 def __init__(self, proxy: Optional[str] = None): """ Initializes LinkedInScraper with the LinkedIn job search url """ + self.scraper_input = None site = Site(Site.LINKEDIN) self.country = "worldwide" - self.url = "https://www.linkedin.com" super().__init__(site, proxy=proxy) def scrape(self, scraper_input: ScraperInput) -> JobResponse: @@ -53,28 +57,16 @@ class LinkedInScraper(Scraper): :param scraper_input: :return: job_response """ + self.scraper_input = scraper_input job_list: list[JobPost] = [] seen_urls = set() url_lock = Lock() page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 - seconds_old = ( scraper_input.hours_old * 3600 if scraper_input.hours_old else None ) - - def job_type_code(job_type_enum): - mapping = { - JobType.FULL_TIME: "F", - JobType.PART_TIME: "P", - JobType.INTERNSHIP: "I", - JobType.CONTRACT: "C", - JobType.TEMPORARY: "T", - } - - return mapping.get(job_type_enum, "") - continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000 while continue_search(): @@ -84,7 +76,7 @@ class LinkedInScraper(Scraper): "location": scraper_input.location, "distance": scraper_input.distance, "f_WT": 2 if scraper_input.is_remote else None, - "f_JT": job_type_code(scraper_input.job_type) + "f_JT": self.job_type_code(scraper_input.job_type) if scraper_input.job_type else None, "pageNum": 0, @@ -97,23 +89,25 @@ class LinkedInScraper(Scraper): params = {k: v for k, v in params.items() if v is not None} try: response = session.get( - f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?", + f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?", params=params, allow_redirects=True, proxies=self.proxy, - headers=self.headers(), + headers=self.headers, timeout=10, ) - response.raise_for_status() - - except requests.HTTPError as e: - raise LinkedInException( - f"bad response status code: {e.response.status_code}" - ) - except ProxyError as e: - raise LinkedInException("bad proxy") + if response.status_code not in range(200, 400): + if response.status_code == 429: + logger.error(f'429 Response - Blocked by LinkedIn for too many requests') + else: + logger.error(f'LinkedIn response status code {response.status_code}') + return JobResponse(jobs=job_list) except Exception as e: + if "Proxy responded with" in str(e): + logger.error(f'LinkedIn: Bad proxy') + else: + logger.error(f'LinkedIn: {str(e)}') + return JobResponse(jobs=job_list) - raise LinkedInException(str(e)) soup = BeautifulSoup(response.text, "html.parser") job_cards = soup.find_all("div", class_="base-search-card") @@ -126,29 +120,29 @@ class LinkedInScraper(Scraper): if href_tag and "href" in href_tag.attrs: href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1] - job_url = f"{self.url}/jobs/view/{job_id}" + job_url = f"{self.base_url}/jobs/view/{job_id}" with url_lock: if job_url in seen_urls: continue seen_urls.add(job_url) - - # Call process_job directly without threading try: - job_post = self.process_job(job_card, job_url, scraper_input.full_description) + job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description) if job_post: job_list.append(job_post) + if not continue_search(): + break except Exception as e: - raise LinkedInException("Exception occurred while processing jobs") + raise LinkedInException(str(e)) if continue_search(): - time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2)) + time.sleep(random.uniform(self.delay, self.delay + 2)) page += 25 job_list = job_list[: scraper_input.results_wanted] return JobResponse(jobs=job_list) - def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: + def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]: salary_tag = job_card.find('span', class_='job-search-card__salary-info') compensation = None @@ -178,7 +172,7 @@ class LinkedInScraper(Scraper): company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A" metadata_card = job_card.find("div", class_="base-search-card__metadata") - location = self.get_location(metadata_card) + location = self._get_location(metadata_card) datetime_tag = ( metadata_card.find("time", class_="job-search-card__listdate") @@ -190,12 +184,12 @@ class LinkedInScraper(Scraper): datetime_str = datetime_tag["datetime"] try: date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") - except Exception as e: + except: date_posted = None benefits_tag = job_card.find("span", class_="result-benefits__text") benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None if full_descr: - description, job_type = self.get_job_description(job_url) + description, job_type = self._get_job_description(job_url) return JobPost( title=title, @@ -212,7 +206,7 @@ class LinkedInScraper(Scraper): num_urgent_words=count_urgent_words(description) if description else None, ) - def get_job_description( + def _get_job_description( self, job_page_url: str ) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]: """ @@ -222,11 +216,9 @@ class LinkedInScraper(Scraper): """ try: session = create_session(is_tls=False, has_retry=True) - response = session.get(job_page_url, timeout=5, proxies=self.proxy) + response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy) response.raise_for_status() - except requests.HTTPError as e: - return None, None - except Exception as e: + except: return None, None if response.url == "https://www.linkedin.com/signup": return None, None @@ -241,40 +233,13 @@ class LinkedInScraper(Scraper): for attr in list(tag.attrs): del tag[attr] return tag - div_content = remove_attributes(div_content) description = div_content.prettify(formatter="html") + if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: + description = markdown_converter(description) + return description, self._parse_job_type(soup) - def get_job_type( - soup_job_type: BeautifulSoup, - ) -> list[JobType] | None: - """ - Gets the job type from job page - :param soup_job_type: - :return: JobType - """ - h3_tag = soup_job_type.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Employment type" in text, - ) - - employment_type = None - 
if h3_tag: - employment_type_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if employment_type_span: - employment_type = employment_type_span.get_text(strip=True) - employment_type = employment_type.lower() - employment_type = employment_type.replace("-", "") - - return [get_enum_from_job_type(employment_type)] if employment_type else [] - - return description, get_job_type(soup) - - def get_location(self, metadata_card: Optional[Tag]) -> Location: + def _get_location(self, metadata_card: Optional[Tag]) -> Location: """ Extracts the location data from the job metadata card. :param metadata_card @@ -299,25 +264,50 @@ class LinkedInScraper(Scraper): location = Location( city=city, state=state, - country=Country.from_string(country), + country=Country.from_string(country) ) - return location @staticmethod - def headers() -> dict: + def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None: + """ + Gets the job type from job page + :param soup_job_type: + :return: JobType + """ + h3_tag = soup_job_type.find( + "h3", + class_="description__job-criteria-subheader", + string=lambda text: "Employment type" in text, + ) + employment_type = None + if h3_tag: + employment_type_span = h3_tag.find_next_sibling( + "span", + class_="description__job-criteria-text description__job-criteria-text--criteria", + ) + if employment_type_span: + employment_type = employment_type_span.get_text(strip=True) + employment_type = employment_type.lower() + employment_type = employment_type.replace("-", "") + + return [get_enum_from_job_type(employment_type)] if employment_type else [] + + @staticmethod + def job_type_code(job_type_enum: JobType) -> str: return { - "authority": "www.linkedin.com", - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "accept-language": "en-US,en;q=0.9", - "cache-control": "max-age=0", - "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', - # 'sec-ch-ua-mobile': '?0', - # 'sec-ch-ua-platform': '"macOS"', - # 'sec-fetch-dest': 'document', - # 'sec-fetch-mode': 'navigate', - # 'sec-fetch-site': 'none', - # 'sec-fetch-user': '?1', - "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - } + JobType.FULL_TIME: "F", + JobType.PART_TIME: "P", + JobType.INTERNSHIP: "I", + JobType.CONTRACT: "C", + JobType.TEMPORARY: "T", + }.get(job_type_enum, "") + + headers = { + "authority": "www.linkedin.com", + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "en-US,en;q=0.9", + "cache-control": "max-age=0", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + } diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 5366193..5f54569 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -2,13 +2,16 @@ import re import logging import numpy as np +import html2text import tls_client import requests from requests.adapters import HTTPAdapter, Retry from ..jobs import JobType +text_maker = html2text.HTML2Text() logger = logging.getLogger("JobSpy") +logger.propagate = False if not 
logger.handlers: logger.setLevel(logging.ERROR) console_handler = logging.StreamHandler() @@ -32,6 +35,17 @@ def count_urgent_words(description: str) -> int: return count +def markdown_converter(description_html: str): + if description_html is None: + return "" + text_maker.ignore_links = False + try: + markdown = text_maker.handle(description_html) + return markdown.strip() + except AssertionError as e: + return "" + + def extract_emails_from_text(text: str) -> list[str] | None: if not text: return None @@ -42,14 +56,10 @@ def extract_emails_from_text(text: str) -> list[str] | None: def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session: """ Creates a requests session with optional tls, proxy, and retry settings. - :return: A session object """ if is_tls: - session = tls_client.Session( - client_identifier="chrome112", - random_tls_extension_order=True, - ) + session = tls_client.Session(random_tls_extension_order=True) session.proxies = proxy else: session = requests.Session() @@ -66,7 +76,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bo session.mount('http://', adapter) session.mount('https://', adapter) - return session diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index fd7f25a..af3ca7b 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -6,33 +6,76 @@ This module contains routines to scrape ZipRecruiter. """ import math import time -from datetime import datetime, timezone +from datetime import datetime from typing import Optional, Tuple, Any from concurrent.futures import ThreadPoolExecutor from .. import Scraper, ScraperInput, Site -from ..exceptions import ZipRecruiterException -from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country -from ..utils import count_urgent_words, extract_emails_from_text, create_session +from ..utils import ( + logger, + count_urgent_words, + extract_emails_from_text, + create_session, + markdown_converter +) +from ...jobs import ( + JobPost, + Compensation, + Location, + JobResponse, + JobType, + Country, + DescriptionFormat +) class ZipRecruiterScraper(Scraper): + base_url = "https://www.ziprecruiter.com" + api_url = "https://api.ziprecruiter.com" + def __init__(self, proxy: Optional[str] = None): """ Initializes ZipRecruiterScraper with the ZipRecruiter job search url """ - site = Site(Site.ZIP_RECRUITER) - self.url = "https://www.ziprecruiter.com" + self.scraper_input = None self.session = create_session(proxy) - self.get_cookies() - super().__init__(site, proxy=proxy) + self._get_cookies() + super().__init__(Site.ZIP_RECRUITER, proxy=proxy) + self.delay = 5 self.jobs_per_page = 20 self.seen_urls = set() - self.delay = 5 - def find_jobs_in_page( + def scrape(self, scraper_input: ScraperInput) -> JobResponse: + """ + Scrapes ZipRecruiter for jobs with scraper_input criteria. + :param scraper_input: Information about job search criteria. + :return: JobResponse containing a list of jobs. 
+ """ + self.scraper_input = scraper_input + job_list: list[JobPost] = [] + continue_token = None + + max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page) + for page in range(1, max_pages + 1): + if len(job_list) >= scraper_input.results_wanted: + break + if page > 1: + time.sleep(self.delay) + + jobs_on_page, continue_token = self._find_jobs_in_page( + scraper_input, continue_token + ) + if jobs_on_page: + job_list.extend(jobs_on_page) + else: + break + if not continue_token: + break + return JobResponse(jobs=job_list[: scraper_input.results_wanted]) + + def _find_jobs_in_page( self, scraper_input: ScraperInput, continue_token: str | None = None ) -> Tuple[list[JobPost], Optional[str]]: """ @@ -41,73 +84,51 @@ class ZipRecruiterScraper(Scraper): :param continue_token: :return: jobs found on page """ - params = self.add_params(scraper_input) + jobs_list = [] + params = self._add_params(scraper_input) if continue_token: params["continue_from"] = continue_token try: - response = self.session.get( - f"https://api.ziprecruiter.com/jobs-app/jobs", - headers=self.headers(), + res= self.session.get( + f"{self.api_url}/jobs-app/jobs", + headers=self.headers, params=params ) - if response.status_code != 200: - raise ZipRecruiterException( - f"bad response status code: {response.status_code}" - ) + if res.status_code not in range(200, 400): + if res.status_code == 429: + logger.error(f'429 Response - Blocked by ZipRecruiter for too many requests') + else: + logger.error(f'ZipRecruiter response status code {res.status_code}') + return jobs_list, "" except Exception as e: - if "Proxy responded with non 200 code" in str(e): - raise ZipRecruiterException("bad proxy") - raise ZipRecruiterException(str(e)) + if "Proxy responded with" in str(e): + logger.error(f'Indeed: Bad proxy') + else: + logger.error(f'Indeed: {str(e)}') + return jobs_list, "" - response_data = response.json() - jobs_list = response_data.get("jobs", []) - next_continue_token = response_data.get("continue", None) + res_data = res.json() + jobs_list = res_data.get("jobs", []) + next_continue_token = res_data.get("continue", None) with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor: - job_results = [executor.submit(self.process_job, job) for job in jobs_list] + job_results = [executor.submit(self._process_job, job) for job in jobs_list] job_list = list(filter(None, (result.result() for result in job_results))) return job_list, next_continue_token - def scrape(self, scraper_input: ScraperInput) -> JobResponse: + def _process_job(self, job: dict) -> JobPost | None: """ - Scrapes ZipRecruiter for jobs with scraper_input criteria. - :param scraper_input: Information about job search criteria. - :return: JobResponse containing a list of jobs. 
+ Processes an individual job dict from the response """ - job_list: list[JobPost] = [] - continue_token = None - - max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page) - - for page in range(1, max_pages + 1): - if len(job_list) >= scraper_input.results_wanted: - break - - if page > 1: - time.sleep(self.delay) - - jobs_on_page, continue_token = self.find_jobs_in_page( - scraper_input, continue_token - ) - if jobs_on_page: - job_list.extend(jobs_on_page) - - if not continue_token: - break - - return JobResponse(jobs=job_list[: scraper_input.results_wanted]) - - def process_job(self, job: dict) -> JobPost | None: - """Processes an individual job dict from the response""" title = job.get("name") - job_url = f"https://www.ziprecruiter.com/jobs//j?lvk={job['listing_key']}" + job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}" if job_url in self.seen_urls: return self.seen_urls.add(job_url) description = job.get("job_description", "").strip() - + description = markdown_converter(description) if self.scraper_input.description_format == DescriptionFormat.MARKDOWN else description company = job.get("hiring_company", {}).get("name") country_value = "usa" if job.get("job_country") == "US" else "canada" country_enum = Country.from_string(country_value) @@ -115,11 +136,10 @@ class ZipRecruiterScraper(Scraper): location = Location( city=job.get("job_city"), state=job.get("job_state"), country=country_enum ) - job_type = ZipRecruiterScraper.get_job_type_enum( + job_type = self._get_job_type_enum( job.get("employment_type", "").replace("_", "").lower() ) date_posted = datetime.fromisoformat(job['posted_time'].rstrip("Z")).date() - return JobPost( title=title, company_name=company, @@ -144,20 +164,19 @@ class ZipRecruiterScraper(Scraper): num_urgent_words=count_urgent_words(description) if description else None, ) - def get_cookies(self): - url="https://api.ziprecruiter.com/jobs-app/event" + def _get_cookies(self): data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple" - self.session.post(url, data=data, headers=ZipRecruiterScraper.headers()) + self.session.post(f"{self.api_url}/jobs-app/event", data=data, headers=self.headers) @staticmethod - def get_job_type_enum(job_type_str: str) -> list[JobType] | None: + def _get_job_type_enum(job_type_str: str) -> list[JobType] | None: for job_type in JobType: if job_type_str in job_type.value: return [job_type] return None @staticmethod - def add_params(scraper_input) -> dict[str, str | Any]: + def _add_params(scraper_input) -> dict[str, str | Any]: params = { "search": scraper_input.search_term, "location": scraper_input.location, @@ -177,24 +196,15 @@ class ZipRecruiterScraper(Scraper): params["remote"] = 1 if scraper_input.distance: params["radius"] = scraper_input.distance + return {k: v for k, v in params.items() if v is not None} - params = {k: v for k, v in params.items() if v is not None} - - return params - - @staticmethod - def headers() -> dict: - """ - Returns headers needed for requests - :return: dict - Dictionary containing headers - """ - return { - "Host": "api.ziprecruiter.com", - "accept": "*/*", - "x-zr-zva-override": 
"100000000;vid:ZT1huzm_EQlDTVEc", - "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", - "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", - "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", - "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", - "accept-language": "en-US,en;q=0.9", - } + headers = { + "Host": "api.ziprecruiter.com", + "accept": "*/*", + "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc", + "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0", + "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006", + "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)", + "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==", + "accept-language": "en-US,en;q=0.9", + }