Proxy support (#44)

* add proxy support * return as data frame
2026-03-11 07:54:33 -07:00 · 2023-09-07 11:28:17 -05:00
parent a37e7f235e
commit 59f739018a
10 changed files with 366 additions and 319 deletions
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@@ -1,3 +1,9 @@
+"""
+jobspy.scrapers.indeed
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Indeed.
+"""
 import re
 import math
 import io
@@ -12,6 +18,7 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

+from ..exceptions import IndeedException
 from ...jobs import (
    JobPost,
    Compensation,
@@ -20,20 +27,16 @@ from ...jobs import (
    JobResponse,
    JobType,
 )
-from .. import Scraper, ScraperInput, Site, Country, StatusException
-
-
-class ParsingException(Exception):
-    pass
+from .. import Scraper, ScraperInput, Site, Country


 class IndeedScraper(Scraper):
-    def __init__(self):
+    def __init__(self, proxy: Optional[str] = None):
        """
        Initializes IndeedScraper with the Indeed job search url
        """
        site = Site(Site.INDEED)
-        super().__init__(site)
+        super().__init__(site, proxy=proxy)

        self.jobs_per_page = 15
        self.seen_urls = set()
@@ -52,7 +55,7 @@ class IndeedScraper(Scraper):
        domain = self.country.domain_value
        self.url = f"https://{domain}.indeed.com"

-        job_list = []
+        job_list: list[JobPost] = []

        params = {
            "q": scraper_input.search_term,
@@ -71,15 +74,26 @@ class IndeedScraper(Scraper):

        if sc_values:
            params["sc"] = "0kf:" + "".join(sc_values) + ";"
-        response = session.get(self.url + "/jobs", params=params, allow_redirects=True)
-        # print(response.status_code)
-
-        if response.status_code not in range(200, 400):
-            raise StatusException(response.status_code)
+        try:
+            response = session.get(
+                self.url + "/jobs",
+                params=params,
+                allow_redirects=True,
+                proxy=self.proxy,
+                timeout_seconds=10,
+            )
+            if response.status_code not in range(200, 400):
+                raise IndeedException(
+                    f"bad response with status code: {response.status_code}"
+                )
+        except Exception as e:
+            if "Proxy responded with" in str(e):
+                raise IndeedException("bad proxy")
+            raise IndeedException(str(e))

        soup = BeautifulSoup(response.content, "html.parser")
        if "did not match any jobs" in response.text:
-            raise ParsingException("Search did not match any jobs")
+            raise IndeedException("Parsing exception: Search did not match any jobs")

        jobs = IndeedScraper.parse_jobs(
            soup
@@ -91,7 +105,7 @@ class IndeedScraper(Scraper):
            .get("mosaicProviderJobCardsModel", {})
            .get("results")
        ):
-            raise Exception("No jobs found.")
+            raise IndeedException("No jobs found.")

        def process_job(job) -> Optional[JobPost]:
            job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
@@ -169,42 +183,24 @@ class IndeedScraper(Scraper):
            math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
        )

-        try:
-            #: get first page to initialize session
-            job_list, total_results = self.scrape_page(scraper_input, 0, session)
+        #: get first page to initialize session
+        job_list, total_results = self.scrape_page(scraper_input, 0, session)

-            with ThreadPoolExecutor(max_workers=1) as executor:
-                futures: list[Future] = [
-                    executor.submit(self.scrape_page, scraper_input, page, session)
-                    for page in range(1, pages_to_process + 1)
-                ]
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            futures: list[Future] = [
+                executor.submit(self.scrape_page, scraper_input, page, session)
+                for page in range(1, pages_to_process + 1)
+            ]

-                for future in futures:
-                    jobs, _ = future.result()
+            for future in futures:
+                jobs, _ = future.result()

-                    job_list += jobs
-        except StatusException as e:
-            return JobResponse(
-                success=False,
-                error=f"Indeed returned status code {e.status_code}",
-            )
-
-        except ParsingException as e:
-            return JobResponse(
-                success=False,
-                error=f"Indeed failed to parse response: {e}",
-            )
-        except Exception as e:
-            return JobResponse(
-                success=False,
-                error=f"Indeed failed to scrape: {e}",
-            )
+                job_list += jobs

        if len(job_list) > scraper_input.results_wanted:
            job_list = job_list[: scraper_input.results_wanted]

        job_response = JobResponse(
-            success=True,
            jobs=job_list,
            total_results=total_results,
        )
@@ -224,9 +220,9 @@ class IndeedScraper(Scraper):

        try:
            response = session.get(
-                formatted_url, allow_redirects=True, timeout_seconds=5
+                formatted_url, allow_redirects=True, timeout_seconds=5, proxy=self.proxy
            )
-        except requests.exceptions.Timeout:
+        except Exception as e:
            return None

        if response.status_code not in range(200, 400):
@@ -253,7 +249,6 @@ class IndeedScraper(Scraper):
                    label = taxonomy["attributes"][0].get("label")
                    if label:
                        job_type_str = label.replace("-", "").replace(" ", "").lower()
-                        # print(f"Debug: job_type_str = {job_type_str}")
                        return IndeedScraper.get_enum_from_value(job_type_str)
        return None

@@ -299,9 +294,9 @@ class IndeedScraper(Scraper):
                jobs = json.loads(m.group(1).strip())
                return jobs
            else:
-                raise ParsingException("Could not find mosaic provider job cards data")
+                raise IndeedException("Could not find mosaic provider job cards data")
        else:
-            raise ParsingException(
+            raise IndeedException(
                "Could not find a script tag containing mosaic provider data"
            )