fix job type search (#106)

2026-03-11 07:54:33 -07:00 · 2024-02-12 11:02:48 -06:00
parent 91b137ef86
commit aeb1a50d2c
7 changed files with 220 additions and 137 deletions
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@@ -11,7 +11,6 @@ import requests
 from typing import Any
 from datetime import datetime

-import urllib.parse
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future
@@ -22,7 +21,7 @@ from ..utils import (
    extract_emails_from_text,
    create_session,
    get_enum_from_job_type,
-    modify_and_get_description
+    logger
 )
 from ...jobs import (
    JobPost,
@@ -50,13 +49,14 @@ class IndeedScraper(Scraper):

    def scrape_page(
        self, scraper_input: ScraperInput, page: int
-    ) -> tuple[list[JobPost], int]:
+    ) -> list[JobPost]:
        """
        Scrapes a page of Indeed for jobs with scraper_input criteria
        :param scraper_input:
        :param page:
        :return: jobs found on page, total number of jobs found for search
        """
+        job_list = []
        self.country = scraper_input.country
        domain = self.country.indeed_domain_value
        self.url = f"https://{domain}.indeed.com"
@@ -76,14 +76,14 @@ class IndeedScraper(Scraper):
                )
        except Exception as e:
            if "Proxy responded with" in str(e):
-                raise IndeedException("bad proxy")
-            raise IndeedException(str(e))
+                logger.error(f'Indeed: Bad proxy')
+            else:
+                logger.error(f'Indeed: {str(e)}')
+            return job_list

        soup = BeautifulSoup(response.content, "html.parser")
-        job_list = []
-        total_num_jobs = IndeedScraper.total_jobs(soup)
        if "did not match any jobs" in response.text:
-            return job_list, total_num_jobs
+            return job_list

        jobs = IndeedScraper.parse_jobs(
            soup
@@ -145,7 +145,7 @@ class IndeedScraper(Scraper):

        job_list = [result.result() for result in job_results if result.result()]

-        return job_list, total_num_jobs
+        return job_list

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
@@ -153,7 +153,7 @@ class IndeedScraper(Scraper):
        :param scraper_input:
        :return: job_response
        """
-        job_list, total_results = self.scrape_page(scraper_input, 0)
+        job_list = self.scrape_page(scraper_input, 0)
        pages_processed = 1

        while len(self.seen_urls) < scraper_input.results_wanted:
@@ -167,7 +167,7 @@ class IndeedScraper(Scraper):
                ]

                for future in futures:
-                    jobs, _ = future.result()
+                    jobs = future.result()
                    if jobs:
                        job_list += jobs
                        new_jobs = True
@@ -182,55 +182,7 @@ class IndeedScraper(Scraper):
        if len(self.seen_urls) > scraper_input.results_wanted:
            job_list = job_list[:scraper_input.results_wanted]

-        job_response = JobResponse(
-            jobs=job_list,
-            total_results=total_results,
-        )
-        return job_response
-
-    def get_description(self, job_page_url: str) -> str | None:
-        """
-        Retrieves job description by going to the job page url
-        :param job_page_url:
-        :return: description
-        """
-        parsed_url = urllib.parse.urlparse(job_page_url)
-        params = urllib.parse.parse_qs(parsed_url.query)
-        jk_value = params.get("jk", [None])[0]
-        formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
-        session = create_session(self.proxy)
-
-        try:
-            response = session.get(
-                formatted_url,
-                headers=self.get_headers(),
-                allow_redirects=True,
-                timeout_seconds=5,
-            )
-        except Exception as e:
-            return None
-
-        if response.status_code not in range(200, 400):
-            return None
-
-        try:
-            soup = BeautifulSoup(response.text, 'html.parser')
-            script_tags = soup.find_all('script')
-
-            job_description = ''
-            for tag in script_tags:
-                if 'window._initialData' in tag.text:
-                    json_str = tag.text
-                    json_str = json_str.split('window._initialData=')[1]
-                    json_str = json_str.rsplit(';', 1)[0]
-                    data = json.loads(json_str)
-                    job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
-                    break
-        except (KeyError, TypeError, IndexError):
-            return None
-
-        soup = BeautifulSoup(job_description, "html.parser")
-        return modify_and_get_description(soup)
+        return JobResponse(jobs=job_list)

    @staticmethod
    def get_job_type(job: dict) -> list[JobType] | None:
@@ -330,24 +282,6 @@ class IndeedScraper(Scraper):
                "Could not find any results for the search"
            )

-    @staticmethod
-    def total_jobs(soup: BeautifulSoup) -> int:
-        """
-        Parses the total jobs for that search from soup object
-        :param soup:
-        :return: total_num_jobs
-        """
-        script = soup.find("script", string=lambda t: t and "window._initialData" in t)
-
-        pattern = re.compile(r"window._initialData\s*=\s*({.*})\s*;", re.DOTALL)
-        match = pattern.search(script.string)
-        total_num_jobs = 0
-        if match:
-            json_str = match.group(1)
-            data = json.loads(json_str)
-            total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
-        return total_num_jobs
-
    @staticmethod
    def get_headers():
        return {
@@ -380,7 +314,7 @@ class IndeedScraper(Scraper):
        if scraper_input.is_remote:
            sc_values.append("attr(DSQF7)")
        if scraper_input.job_type:
-            sc_values.append("jt({})".format(scraper_input.job_type.value))
+            sc_values.append("jt({})".format(scraper_input.job_type.value[0]))

        if sc_values:
            params["sc"] = "0kf:" + "".join(sc_values) + ";"
@@ -406,7 +340,7 @@ class IndeedScraper(Scraper):
            taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0
            for taxonomy in job.get("taxonomyAttributes", [])
        )
-        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        return is_remote_in_attributes or is_remote_in_description or is_remote_in_location or is_remote_in_taxonomy

    def get_job_details(self, job_keys: list[str]) -> dict:
        """