From 13c76944746c701bec6c5aaa666bd3979e5fe422 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Feb 2024 17:47:15 -0600 Subject: [PATCH] Easy apply (#95) * enh(glassdoor): easy apply filter * enh(ziprecruiter): easy apply * enh(indeed): use mobile headers * chore: version --- README.md | 2 +- pyproject.toml | 2 +- src/jobspy/scrapers/indeed/__init__.py | 76 +++++++++++--------- src/jobspy/scrapers/ziprecruiter/__init__.py | 2 + 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index e798335..15df377 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Optional ├── is_remote (bool) ├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower) ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' -├── easy_apply (bool): filters for jobs that are hosted on LinkedIn, Glassdoor +├── easy_apply (bool): filters for jobs that are hosted on the job board site ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling) ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result) ``` diff --git a/pyproject.toml b/pyproject.toml index 049155d..783de19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.37" +version = "1.1.38" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index eeb7ff8..f1a714b 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -8,6 +8,7 @@ import re import math import io import json +from typing import Any from datetime import datetime import urllib.parse @@ -44,7 +45,7 @@ class IndeedScraper(Scraper): site = Site(Site.INDEED) super().__init__(site, proxy=proxy) - self.jobs_per_page = 15 + self.jobs_per_page = 25 self.seen_urls = set() def scrape_page( @@ -60,30 +61,12 @@ class IndeedScraper(Scraper): domain = self.country.indeed_domain_value self.url = f"https://{domain}.indeed.com" - params = { - "q": scraper_input.search_term, - "l": scraper_input.location, - "filter": 0, - "start": scraper_input.offset + page * 10, - "sort": "date" - } - if scraper_input.distance: - params["radius"] = scraper_input.distance - - sc_values = [] - if scraper_input.is_remote: - sc_values.append("attr(DSQF7)") - if scraper_input.job_type: - sc_values.append("jt({})".format(scraper_input.job_type.value)) - - if sc_values: - params["sc"] = "0kf:" + "".join(sc_values) + ";" try: session = create_session(self.proxy) response = session.get( - f"{self.url}/jobs", + f"{self.url}/m/jobs", headers=self.get_headers(), - params=params, + params=self.add_params(scraper_input, page), allow_redirects=True, timeout_seconds=10, ) @@ -112,8 +95,8 @@ class IndeedScraper(Scraper): ): raise IndeedException("No jobs found.") - def process_job(job) -> JobPost | None: - job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}' + def process_job(job: dict) -> JobPost | None: + job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}' job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' if job_url in self.seen_urls: return None @@ -194,7 +177,7 @@ class IndeedScraper(Scraper): #: get first page to initialize session job_list, total_results = self.scrape_page(scraper_input, 0) - with ThreadPoolExecutor(max_workers=1) as executor: + with ThreadPoolExecutor(max_workers=10) as executor: futures: list[Future] = [ executor.submit(self.scrape_page, scraper_input, page) for page in range(1, pages_to_process + 1) @@ -331,17 +314,14 @@ class IndeedScraper(Scraper): @staticmethod def get_headers(): return { - "authority": "www.indeed.com", - "accept": "*/*", - "accept-language": "en-US,en;q=0.9", - "referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw", - "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', - "sec-ch-ua-mobile": "?0", - "sec-ch-ua-platform": '"Windows"', - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + 'Host': 'www.indeed.com', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'sec-fetch-site': 'same-origin', + 'sec-fetch-dest': 'document', + 'accept-language': 'en-US,en;q=0.9', + 'sec-fetch-mode': 'navigate', + 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0', + 'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3', } @staticmethod @@ -354,3 +334,29 @@ class IndeedScraper(Scraper): if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0: return True return False + + @staticmethod + def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]: + params = { + "q": scraper_input.search_term, + "l": scraper_input.location, + "filter": 0, + "start": scraper_input.offset + page * 10, + "sort": "date" + } + if scraper_input.distance: + params["radius"] = scraper_input.distance + + sc_values = [] + if scraper_input.is_remote: + sc_values.append("attr(DSQF7)") + if scraper_input.job_type: + sc_values.append("jt({})".format(scraper_input.job_type.value)) + + if sc_values: + params["sc"] = "0kf:" + "".join(sc_values) + ";" + + if scraper_input.easy_apply: + params['iafilter'] = 1 + + return params diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index 16a67f3..e73d69c 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -183,6 +183,8 @@ class ZipRecruiterScraper(Scraper): job_type_value = "part_time" else: job_type_value = scraper_input.job_type.value + if scraper_input.easy_apply: + params['zipapply'] = 1 if job_type_value: params[