readd proxy support for zip (#64)

chore: version
2026-03-04 19:44:30 -08:00 · 2023-10-29 08:54:56 -05:00 · 2023-10-28 16:58:32 -05:00 · 2023-10-28 16:54:04 -05:00 · 2023-10-28 16:52:05 -05:00 · 2023-10-28 16:43:44 -05:00
5 changed files with 42 additions and 140 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.15"
+version = "1.1.22"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -177,8 +177,8 @@ class CompensationInterval(Enum):
 class Compensation(BaseModel):
    interval: Optional[CompensationInterval] = None
-    min_amount: int = None
+    min_amount: int | None = None
-    max_amount: int = None
+    max_amount: int | None = None
    currency: Optional[str] = "USD"
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@@ -58,7 +58,6 @@ class IndeedScraper(Scraper):
        self.country = scraper_input.country
        domain = self.country.domain_value
        self.url = f"https://{domain}.indeed.com"
        session = create_session(self.proxy)
        params = {
            "q": scraper_input.search_term,
@@ -78,6 +77,7 @@ class IndeedScraper(Scraper):
        if sc_values:
            params["sc"] = "0kf:" + "".join(sc_values) + ";"
        try:
            session = create_session(self.proxy, is_tls=True)
            response = session.get(
                f"{self.url}/jobs",
                headers=self.get_headers(),
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -1,4 +1,6 @@
 import re
 import requests
 import tls_client
 from ..jobs import JobType
@@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None:
    return email_regex.findall(text)
-def create_session(proxy: str | None = None):
+def create_session(proxy: dict | None = None, is_tls: bool = True):
    """
    Creates a tls client session
    :return: A session object with or without proxies.
    """
-    session = tls_client.Session(
+    if is_tls:
-        client_identifier="chrome112",
+        session = tls_client.Session(
-        random_tls_extension_order=True,
+            client_identifier="chrome112",
-    )
+            random_tls_extension_order=True,
-    session.proxies = proxy
+        )
-    # TODO multiple proxies
+        session.proxies = proxy
-    # if self.proxies:
+        # TODO multiple proxies
-    #     session.proxies = {
+        # if self.proxies:
-    #         "http": random.choice(self.proxies),
+        #     session.proxies = {
-    #         "https": random.choice(self.proxies),
+        #         "http": random.choice(self.proxies),
-    #     }
+        #         "https": random.choice(self.proxies),
        #     }
    else:
        session = requests.Session()
        session.allow_redirects = True
        session.proxies.update(proxy)
    return session
--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@@ -5,29 +5,18 @@ jobspy.scrapers.ziprecruiter
 This module contains routines to scrape ZipRecruiter.
 """
 import math
-import json
+import time
 import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
 from urllib.parse import urlparse, parse_qs, urlunparse
 import requests
 from bs4 import BeautifulSoup
-from bs4.element import Tag
+from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import ThreadPoolExecutor, Future
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ..utils import count_urgent_words, extract_emails_from_text, create_session
-from ...jobs import (
+from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
    JobPost,
    Compensation,
    CompensationInterval,
    Location,
    JobResponse,
    JobType,
    Country,
 )
 class ZipRecruiterScraper(Scraper):
@@ -42,21 +31,22 @@ class ZipRecruiterScraper(Scraper):
        self.jobs_per_page = 20
        self.seen_urls = set()
-    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
+    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
        """
        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
        :param scraper_input:
        :param continue_token:
        :return: jobs found on page
        """
        params = self.add_params(scraper_input)
        if continue_token:
            params['continue'] = continue_token
        try:
-            response = requests.get(
+            session = create_session(self.proxy, is_tls=False)
            response = session.get(
                f"https://api.ziprecruiter.com/jobs-app/jobs",
                headers=self.headers(),
                params=self.add_params(scraper_input),
                allow_redirects=True,
                timeout=10,
            )
            if response.status_code != 200:
@@ -68,11 +58,12 @@ class ZipRecruiterScraper(Scraper):
                raise ZipRecruiterException("bad proxy")
            raise ZipRecruiterException(str(e))
        time.sleep(5)
        response_data = response.json()
        jobs_list = response_data.get("jobs", [])
        next_continue_token = response_data.get('continue', None)
-        with ThreadPoolExecutor(max_workers=10) as executor:
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
            job_results = [
                executor.submit(self.process_job, job)
                for job in jobs_list
@@ -108,17 +99,17 @@ class ZipRecruiterScraper(Scraper):
        return JobResponse(jobs=job_list)
-    def process_job(self, job: dict) -> JobPost:
+    @staticmethod
-        """the most common type of jobs page on ZR"""
+    def process_job(job: dict) -> JobPost:
        """ Processes an individual job dict from the response """
        title = job.get("name")
        job_url = job.get("job_url")
        # job_url = updated_job_url if updated_job_url else job_url
        description = BeautifulSoup(
            job.get("job_description", "").strip(), "html.parser"
        ).get_text()
-        company = job.get("source")
+        company = job['hiring_company'].get("name") if "hiring_company" in job else None
        location = Location(
            city=job.get("job_city"), state=job.get("job_state"), country='usa' if job.get("job_country") == 'US' else 'canada'
        )
@@ -142,7 +133,12 @@ class ZipRecruiterScraper(Scraper):
            company_name=company,
            location=location,
            job_type=job_type,
-            # compensation=compensation,
+            compensation=Compensation(
                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
                min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
                max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
                currency=job.get("compensation_currency"),
            ),
            date_posted=date_posted,
            job_url=job_url,
            description=description,
@@ -186,107 +182,6 @@ class ZipRecruiterScraper(Scraper):
        return params
    @staticmethod
    def get_interval(interval_str: str):
        """
         Maps the interval alias to its appropriate CompensationInterval.
        :param interval_str
        :return: CompensationInterval
        """
        interval_alias = {"annually": CompensationInterval.YEARLY}
        interval_str = interval_str.lower()
        if interval_str in interval_alias:
            return interval_alias[interval_str]
        return CompensationInterval(interval_str)
    @staticmethod
    def get_date_posted(job: Tag) -> Optional[datetime.date]:
        """
        Extracts the date a job was posted
        :param job
        :return: date the job was posted or None
        """
        button = job.find(
            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
        )
        if not button:
            return None
        url_time = button.get("data-href", "")
        url_components = urlparse(url_time)
        params = parse_qs(url_components.query)
        posted_time_str = params.get("posted_time", [None])[0]
        if posted_time_str:
            posted_date = datetime.strptime(
                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
            ).date()
            return posted_date
        return None
    @staticmethod
    def get_compensation(job: Tag) -> Optional[Compensation]:
        """
        Parses the compensation tag from the job BeautifulSoup object
        :param job
        :return: Compensation object or None
        """
        pay_element = job.find("li", {"class": "perk_item perk_pay"})
        if pay_element is None:
            return None
        pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
        def create_compensation_object(pay_string: str) -> Compensation:
            """
            Creates a Compensation object from a pay_string
            :param pay_string
            :return: compensation
            """
            interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
            amounts = []
            for amount in pay_string.split("to"):
                amount = amount.replace(",", "").strip("$ ").split(" ")[0]
                if "K" in amount:
                    amount = amount.replace("K", "")
                    amount = int(float(amount)) * 1000
                else:
                    amount = int(float(amount))
                amounts.append(amount)
            compensation = Compensation(
                interval=interval,
                min_amount=min(amounts),
                max_amount=max(amounts),
                currency="USD/CAD",
            )
            return compensation
        return create_compensation_object(pay)
    @staticmethod
    def get_location(job: Tag) -> Location:
        """
        Extracts the job location from a BeautifulSoup object
        :param job:
        :return: location
        """
        location_link = job.find("a", {"class": "company_location"})
        if location_link is not None:
            location_string = location_link.text.strip()
            parts = location_string.split(", ")
            if len(parts) == 2:
                city, state = parts
            else:
                city, state = None, None
        else:
            city, state = None, None
        return Location(city=city, state=state, country=Country.US_CANADA)
    @staticmethod
    def headers() -> dict:
        """
Author	SHA1	Message	Date
Cullen Watson	e3fc222eb5	readd proxy support for zip (#64)	2023-10-29 08:54:56 -05:00
Cullen	b303b3f841	chore: version	2023-10-28 16:58:32 -05:00
Cullen	1a0c75f323	chore: version	2023-10-28 16:54:04 -05:00
Cullen	e2f6885d61	chore: format	2023-10-28 16:52:05 -05:00
Cullen	8d65d1b652	[chore] version	2023-10-28 16:43:44 -05:00
Cullen	216d3fd39f	ziprecruiter: 5s delay	2023-10-28 16:41:32 -05:00
Cullen Watson	d3bfdc0a6e	ziprecruiter api (#63)	2023-10-28 16:17:28 -05:00