From 75e3f73ba3250149f820a5720abf417b7ab55664 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Tue, 10 Oct 2023 10:20:44 -0500 Subject: [PATCH] fix(indeed): use new session obj per request --- src/jobspy/__init__.py | 1 + src/jobspy/scrapers/indeed/__init__.py | 72 +++++++++++++++++++------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 60eee05..a2ead6a 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -122,6 +122,7 @@ def scrape_jobs( job_data["company"] = job_data["company_name"] job_data["job_type"] = ", ".join(job_type.value[0] for job_type in job_data["job_type"]) if job_data[ "job_type"] else None + job_data["location"] = Location(**job_data["location"]).display_location() diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 65d1144..8d51761 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -9,7 +9,6 @@ import math import io import json from datetime import datetime -from typing import Optional import tls_client import urllib.parse @@ -31,7 +30,7 @@ from .. import Scraper, ScraperInput, Site class IndeedScraper(Scraper): - def __init__(self, proxy: Optional[str] = None): + def __init__(self, proxy: str | None = None): """ Initializes IndeedScraper with the Indeed job search url """ @@ -44,20 +43,18 @@ class IndeedScraper(Scraper): self.seen_urls = set() def scrape_page( - self, scraper_input: ScraperInput, page: int, session: tls_client.Session + self, scraper_input: ScraperInput, page: int ) -> tuple[list[JobPost], int]: """ Scrapes a page of Indeed for jobs with scraper_input criteria :param scraper_input: :param page: - :param session: :return: jobs found on page, total number of jobs found for search """ self.country = scraper_input.country domain = self.country.domain_value self.url = f"https://{domain}.indeed.com" - - job_list: list[JobPost] = [] + session = self.create_session() params = { "q": scraper_input.search_term, @@ -79,9 +76,9 @@ class IndeedScraper(Scraper): try: response = session.get( f"{self.url}/jobs", + headers=self.get_headers(), params=params, allow_redirects=True, - proxy=self.proxy, timeout_seconds=10, ) if response.status_code not in range(200, 400): @@ -109,7 +106,7 @@ class IndeedScraper(Scraper): ): raise IndeedException("No jobs found.") - def process_job(job) -> Optional[JobPost]: + def process_job(job) -> JobPost | None: job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}' job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' if job_url in self.seen_urls: @@ -138,7 +135,7 @@ class IndeedScraper(Scraper): date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = date_posted.strftime("%Y-%m-%d") - description = self.get_description(job_url, session) + description = self.get_description(job_url) with io.StringIO(job["snippet"]) as f: soup_io = BeautifulSoup(f, "html.parser") li_elements = soup_io.find_all("li") @@ -179,20 +176,16 @@ class IndeedScraper(Scraper): :param scraper_input: :return: job_response """ - session = tls_client.Session( - client_identifier="chrome112", random_tls_extension_order=True - ) - pages_to_process = ( math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 ) #: get first page to initialize session - job_list, total_results = self.scrape_page(scraper_input, 0, session) + job_list, total_results = self.scrape_page(scraper_input, 0) with ThreadPoolExecutor(max_workers=1) as executor: futures: list[Future] = [ - executor.submit(self.scrape_page, scraper_input, page, session) + executor.submit(self.scrape_page, scraper_input, page) for page in range(1, pages_to_process + 1) ] @@ -210,21 +203,24 @@ class IndeedScraper(Scraper): ) return job_response - def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]: + def get_description(self, job_page_url: str) -> str | None: """ Retrieves job description by going to the job page url :param job_page_url: - :param session: :return: description """ parsed_url = urllib.parse.urlparse(job_page_url) params = urllib.parse.parse_qs(parsed_url.query) jk_value = params.get("jk", [None])[0] formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" + session = self.create_session() try: response = session.get( - formatted_url, allow_redirects=True, timeout_seconds=5, proxy=self.proxy + formatted_url, + headers=self.get_headers(), + allow_redirects=True, + timeout_seconds=5, ) except Exception as e: return None @@ -241,7 +237,7 @@ class IndeedScraper(Scraper): return text_content @staticmethod - def get_job_type(job: dict) -> Optional[list[JobType]]: + def get_job_type(job: dict) -> list[JobType] | None: """ Parses the job to get list of job types :param job: @@ -276,7 +272,7 @@ class IndeedScraper(Scraper): :return: jobs """ - def find_mosaic_script() -> Optional[Tag]: + def find_mosaic_script() -> Tag | None: """ Finds jobcards script tag :return: script_tag @@ -326,3 +322,39 @@ class IndeedScraper(Scraper): data = json.loads(json_str) total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"]) return total_num_jobs + + @staticmethod + def get_headers(): + return { + 'authority': 'www.indeed.com', + 'accept': '*/*', + 'accept-language': 'en-US,en;q=0.9', + 'referer': 'https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw', + 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' + } + + def create_session(self): + """ + Creates a session with specific client identifiers and assigns proxies if available. + + :return: A session object with or without proxies. + """ + session = tls_client.Session( + client_identifier="chrome112", + random_tls_extension_order=True, + ) + session.proxies = self.proxy + # TODO multiple proxies + # if self.proxies: + # session.proxies = { + # "http": random.choice(self.proxies), + # "https": random.choice(self.proxies), + # } + + return session