Compare commits

..

2 Commits

Author  SHA1        Message                     Date
Cullen  78c1ec8e9f  [fix] add compensation      2023-10-28 16:13:10 -05:00
Cullen  a2dd93aca1  [enh] use ziprecuriter api  2023-10-28 15:50:28 -05:00
4 changed files with 137 additions and 35 deletions

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.23"
version = "1.1.16"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -58,6 +58,7 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country
domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
params = {
"q": scraper_input.search_term,
@@ -77,7 +78,6 @@ class IndeedScraper(Scraper):
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
try:
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"{self.url}/jobs",
headers=self.get_headers(),
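This Indeed hunk moves session creation out of the try block so the session exists before the query parameters are assembled. A minimal standalone sketch of that ordering, using plain requests with invented headers and parameters (the repo's own create_session helper appears in the utils diff further down):

import requests


def fetch_indeed_jobs(search_term: str, domain: str = "www", proxy: str | None = None) -> str:
    """Create the session first, then build params and issue the request."""
    session = requests.Session()
    if proxy:
        # Illustrative proxy handling; the real scraper delegates this to create_session().
        session.proxies.update({"http": proxy, "https": proxy})

    params = {"q": search_term, "filter": 0}
    response = session.get(
        f"https://{domain}.indeed.com/jobs",
        headers={"User-Agent": "Mozilla/5.0"},
        params=params,
        timeout=10,
    )
    response.raise_for_status()
    return response.text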

View File

@@ -1,6 +1,4 @@
import re
import requests
import tls_client
from ..jobs import JobType
@@ -26,29 +24,23 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: dict | None = None, is_tls: bool = True):
def create_session(proxy: str | None = None):
"""
Creates a tls client session
:return: A session object with or without proxies.
"""
if is_tls:
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
if proxy:
session.proxies.update(proxy)
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
return session
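The two versions of create_session above differ in whether a plain requests fallback survives: one keeps an is_tls flag with a requests.Session branch, the other always builds a tls_client session. A standalone sketch of the branching variant, assuming the tls-client package is installed and that the proxy argument is a single URL string (the diff passes a dict straight through, so the proxy handling here is simplified):

import requests
import tls_client


def create_session_sketch(proxy: str | None = None, is_tls: bool = True):
    """Return a browser-fingerprinted tls_client session by default, or a plain requests session."""
    if is_tls:
        # tls_client imitates a real Chrome TLS fingerprint, which some job
        # boards expect before they will serve results.
        session = tls_client.Session(
            client_identifier="chrome112",
            random_tls_extension_order=True,
        )
        if proxy:
            session.proxies = {"http": proxy, "https": proxy}
    else:
        # A plain requests session is enough for JSON APIs that do not
        # inspect the TLS fingerprint.
        session = requests.Session()
        if proxy:
            session.proxies.update({"http": proxy, "https": proxy})
    return session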

View File

@@ -5,18 +5,29 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter.
"""
import math
import time
import json
import re
from datetime import datetime, date
from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
from ...jobs import (
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)
class ZipRecruiterScraper(Scraper):
@@ -31,22 +42,21 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20
self.seen_urls = set()
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:param continue_token:
:return: jobs found on page
"""
params = self.add_params(scraper_input)
if continue_token:
params['continue'] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
response = session.get(
response = requests.get(
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input),
allow_redirects=True,
timeout=10,
)
if response.status_code != 200:
@@ -58,12 +68,11 @@ class ZipRecruiterScraper(Scraper):
raise ZipRecruiterException("bad proxy")
raise ZipRecruiterException(str(e))
time.sleep(5)
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get('continue', None)
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
with ThreadPoolExecutor(max_workers=10) as executor:
job_results = [
executor.submit(self.process_job, job)
for job in jobs_list
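Taken together, the hunks above fetch one page from the jobs-app API and fan the returned job dicts out to a thread pool. A rough end-to-end sketch of that flow; the endpoint and the 'continue' pagination token come from the diff, while the headers, search parameters, and the trivial stand-in for process_job are invented for illustration:

from concurrent.futures import ThreadPoolExecutor

import requests


def fetch_zip_jobs_page(search_term: str, continue_token: str | None = None):
    """Fetch one page of jobs and process each job dict concurrently."""
    params = {"search": search_term}
    if continue_token:
        # Cursor-style pagination: pass the token from the previous response.
        params["continue"] = continue_token

    response = requests.get(
        "https://api.ziprecruiter.com/jobs-app/jobs",
        headers={"User-Agent": "Mozilla/5.0"},
        params=params,
        allow_redirects=True,
        timeout=10,
    )
    response.raise_for_status()
    data = response.json()

    jobs_list = data.get("jobs", [])
    next_token = data.get("continue")

    def process_job_stub(job: dict) -> str | None:
        # Stand-in for ZipRecruiterScraper.process_job: just pull the title.
        return job.get("name")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_job_stub, job) for job in jobs_list]
        titles = [future.result() for future in futures]

    return titles, next_token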
@@ -99,9 +108,8 @@ class ZipRecruiterScraper(Scraper):
return JobResponse(jobs=job_list)
@staticmethod
def process_job(job: dict) -> JobPost:
""" Processes an individual job dict from the response """
def process_job(self, job: dict) -> JobPost:
"""the most common type of jobs page on ZR"""
title = job.get("name")
job_url = job.get("job_url")
@@ -128,13 +136,14 @@ class ZipRecruiterScraper(Scraper):
else:
date_posted = date.today()
return JobPost(
title=title,
company_name=company,
location=location,
job_type=job_type,
compensation=Compensation(
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") ,
min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
currency=job.get("compensation_currency"),
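The Compensation(...) call above normalizes the API's "annual" interval to "yearly" and coerces the min/max fields to integers only when they are present. The same mapping on an invented payload, returning plain values instead of the Compensation model:

def normalize_compensation(job: dict) -> dict:
    """Mirror the field mapping above without depending on the jobspy models."""
    interval = job.get("compensation_interval")
    if interval == "annual":
        interval = "yearly"  # the API says 'annual'; the model expects 'yearly'
    return {
        "interval": interval,
        "min_amount": int(job["compensation_min"]) if "compensation_min" in job else None,
        "max_amount": int(job["compensation_max"]) if "compensation_max" in job else None,
        "currency": job.get("compensation_currency"),
    }


# Hypothetical payload for illustration:
example = {
    "compensation_interval": "annual",
    "compensation_min": 90000,
    "compensation_max": 120000,
    "compensation_currency": "USD",
}
print(normalize_compensation(example))
# -> {'interval': 'yearly', 'min_amount': 90000, 'max_amount': 120000, 'currency': 'USD'}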
@@ -182,6 +191,107 @@ class ZipRecruiterScraper(Scraper):
return params
@staticmethod
def get_interval(interval_str: str):
"""
Maps the interval alias to its appropriate CompensationInterval.
:param interval_str
:return: CompensationInterval
"""
interval_alias = {"annually": CompensationInterval.YEARLY}
interval_str = interval_str.lower()
if interval_str in interval_alias:
return interval_alias[interval_str]
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: Tag) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
:return: date the job was posted or None
"""
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
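get_date_posted pulls a posted_time query parameter out of the save-job button's data-href URL and parses it as a UTC timestamp. A standalone sketch of just the URL-parsing step, run on a made-up href value:

from datetime import date, datetime
from urllib.parse import parse_qs, urlparse


def posted_date_from_href(url_time: str) -> date | None:
    """Extract and parse posted_time, as get_date_posted does above."""
    params = parse_qs(urlparse(url_time).query)
    posted_time_str = params.get("posted_time", [None])[0]
    if posted_time_str:
        return datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()
    return None


# Hypothetical data-href value:
print(posted_date_from_href("/jobs/save?posted_time=2023-10-27T12:00:00Z"))
# -> 2023-10-27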
@staticmethod
def get_compensation(job: Tag) -> Optional[Compensation]:
"""
Parses the compensation tag from the job BeautifulSoup object
:param job
:return: Compensation object or None
"""
pay_element = job.find("li", {"class": "perk_item perk_pay"})
if pay_element is None:
return None
pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
def create_compensation_object(pay_string: str) -> Compensation:
"""
Creates a Compensation object from a pay_string
:param pay_string
:return: compensation
"""
interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
amounts = []
for amount in pay_string.split("to"):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = int(float(amount)) * 1000
else:
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(
interval=interval,
min_amount=min(amounts),
max_amount=max(amounts),
currency="USD/CAD",
)
return compensation
return create_compensation_object(pay)
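To make the string handling in create_compensation_object concrete, here is the same splitting and "K"-suffix logic applied to invented pay strings (the parsing steps follow the code above; the inputs and the free-standing function name are hypothetical):

def parse_pay_string(pay_string: str) -> tuple[int, int, str]:
    """Replicate the amount parsing above: returns (min, max, interval word)."""
    interval_word = pay_string.split()[-1]  # e.g. 'yearly', or 'annually' (mapped by get_interval)
    amounts = []
    for amount in pay_string.split("to"):
        amount = amount.replace(",", "").strip("$ ").split(" ")[0]
        if "K" in amount:
            amount = amount.replace("K", "")
            amount = int(float(amount)) * 1000  # '$60K' -> 60000
        else:
            amount = int(float(amount))
        amounts.append(amount)
    return min(amounts), max(amounts), interval_word


print(parse_pay_string("$60K to $80K yearly"))        # -> (60000, 80000, 'yearly')
print(parse_pay_string("$60,000 to $80,000 yearly"))  # -> (60000, 80000, 'yearly')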
@staticmethod
def get_location(job: Tag) -> Location:
"""
Extracts the job location from a BeautifulSoup object
:param job:
:return: location
"""
location_link = job.find("a", {"class": "company_location"})
if location_link is not None:
location_string = location_link.text.strip()
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
else:
city, state = None, None
else:
city, state = None, None
return Location(city=city, state=state, country=Country.US_CANADA)
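get_location only accepts a two-part "City, ST" string and otherwise falls back to an empty city/state. A quick standalone illustration of that split on invented location strings:

def split_location(location_string: str) -> tuple[str | None, str | None]:
    """Same 'City, ST' split as get_location, minus the BeautifulSoup lookup."""
    parts = location_string.split(", ")
    if len(parts) == 2:
        return parts[0], parts[1]
    return None, None


print(split_location("Austin, TX"))       # -> ('Austin', 'TX')
print(split_location("Remote"))           # -> (None, None)
print(split_location("Dallas, TX, USA"))  # -> (None, None); three parts are rejected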
@staticmethod
def headers() -> dict:
"""