readd proxy support for zip (#64)

2023-10-29 08:54:56 -05:00 · 2023-10-29 08:54:56 -05:00 · e3fc222eb5
parent b303b3f841
commit e3fc222eb5
4 changed files with 32 additions and 136 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.21"
+version = "1.1.22"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@ -58,7 +58,6 @@ class IndeedScraper(Scraper):
        self.country = scraper_input.country
        domain = self.country.domain_value
        self.url = f"https://{domain}.indeed.com"
-        session = create_session(self.proxy)

        params = {
            "q": scraper_input.search_term,
@ -78,6 +77,7 @@ class IndeedScraper(Scraper):
        if sc_values:
            params["sc"] = "0kf:" + "".join(sc_values) + ";"
        try:
+            session = create_session(self.proxy, is_tls=True)
            response = session.get(
                f"{self.url}/jobs",
                headers=self.get_headers(),
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@ -1,4 +1,6 @@
 import re
+
+import requests
 import tls_client
 from ..jobs import JobType

@ -24,23 +26,28 @@ def extract_emails_from_text(text: str) -> list[str] | None:
    return email_regex.findall(text)


-def create_session(proxy: str | None = None):
+def create_session(proxy: dict | None = None, is_tls: bool = True):
    """
-    Creates a tls client session
+    Creates a tls_client session (is_tls=True) or a plain requests session (is_tls=False)

+    :param proxy: proxies mapping to apply to the session, or None for no proxy
+    :param is_tls: when True build a tls_client.Session, otherwise a requests.Session
    :return: A session object with or without proxies.
    """
-    session = tls_client.Session(
-        client_identifier="chrome112",
-        random_tls_extension_order=True,
-    )
-    session.proxies = proxy
-    # TODO multiple proxies
-    # if self.proxies:
-    #     session.proxies = {
-    #         "http": random.choice(self.proxies),
-    #         "https": random.choice(self.proxies),
-    #     }
+    if is_tls:
+        session = tls_client.Session(
+            client_identifier="chrome112",
+            random_tls_extension_order=True,
+        )
+        session.proxies = proxy
+        # TODO multiple proxies
+        # if self.proxies:
+        #     session.proxies = {
+        #         "http": random.choice(self.proxies),
+        #         "https": random.choice(self.proxies),
+        #     }
+    else:
+        session = requests.Session()
+        session.allow_redirects = True
+        # proxy defaults to None and dict.update(None) raises TypeError,
+        # so only merge proxies when a mapping was actually supplied
+        if proxy:
+            session.proxies.update(proxy)

    return session

--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@ -9,25 +9,14 @@ import time
 import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
-from urllib.parse import urlparse, parse_qs, urlunparse

-import requests
 from bs4 import BeautifulSoup
-from bs4.element import Tag
-from concurrent.futures import ThreadPoolExecutor, Future
+from concurrent.futures import ThreadPoolExecutor

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ..utils import count_urgent_words, extract_emails_from_text, create_session
-from ...jobs import (
-    JobPost,
-    Compensation,
-    CompensationInterval,
-    Location,
-    JobResponse,
-    JobType,
-    Country,
-)
+from ...jobs import JobPost, Compensation, Location, JobResponse, JobType


 class ZipRecruiterScraper(Scraper):
@ -42,21 +31,22 @@ class ZipRecruiterScraper(Scraper):
        self.jobs_per_page = 20
        self.seen_urls = set()

-    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: Optional[str] = None) -> Tuple[list[JobPost], Optional[str]]:
+    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
        """
        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
        :param scraper_input:
+        :param continue_token:
        :return: jobs found on page
        """
        params = self.add_params(scraper_input)
        if continue_token:
            params['continue'] = continue_token
        try:
-            response = requests.get(
+            session = create_session(self.proxy, is_tls=False)
+            response = session.get(
                f"https://api.ziprecruiter.com/jobs-app/jobs",
                headers=self.headers(),
                params=self.add_params(scraper_input),
-                allow_redirects=True,
                timeout=10,
            )
            if response.status_code != 200:
@ -73,7 +63,7 @@ class ZipRecruiterScraper(Scraper):
        jobs_list = response_data.get("jobs", [])
        next_continue_token = response_data.get('continue', None)

-        with ThreadPoolExecutor(max_workers=10) as executor:
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
            job_results = [
                executor.submit(self.process_job, job)
                for job in jobs_list
@ -109,12 +99,12 @@ class ZipRecruiterScraper(Scraper):

        return JobResponse(jobs=job_list)

-    def process_job(self, job: dict) -> JobPost:
-        """the most common type of jobs page on ZR"""
+    @staticmethod
+    def process_job(job: dict) -> JobPost:
+        """ Processes an individual job dict from the response """
        title = job.get("name")
        job_url = job.get("job_url")

-
        description = BeautifulSoup(
            job.get("job_description", "").strip(), "html.parser"
        ).get_text()
@ -144,7 +134,7 @@ class ZipRecruiterScraper(Scraper):
            location=location,
            job_type=job_type,
            compensation=Compensation(
-                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval") ,
+                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
                min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
                max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
                currency=job.get("compensation_currency"),
@ -192,107 +182,6 @@ class ZipRecruiterScraper(Scraper):

        return params

-    @staticmethod
-    def get_interval(interval_str: str):
-        """
-         Maps the interval alias to its appropriate CompensationInterval.
-        :param interval_str
-        :return: CompensationInterval
-        """
-        interval_alias = {"annually": CompensationInterval.YEARLY}
-        interval_str = interval_str.lower()
-
-        if interval_str in interval_alias:
-            return interval_alias[interval_str]
-
-        return CompensationInterval(interval_str)
-
-    @staticmethod
-    def get_date_posted(job: Tag) -> Optional[datetime.date]:
-        """
-        Extracts the date a job was posted
-        :param job
-        :return: date the job was posted or None
-        """
-        button = job.find(
-            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
-        )
-        if not button:
-            return None
-
-        url_time = button.get("data-href", "")
-        url_components = urlparse(url_time)
-        params = parse_qs(url_components.query)
-        posted_time_str = params.get("posted_time", [None])[0]
-
-        if posted_time_str:
-            posted_date = datetime.strptime(
-                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
-            ).date()
-            return posted_date
-
-        return None
-
-    @staticmethod
-    def get_compensation(job: Tag) -> Optional[Compensation]:
-        """
-        Parses the compensation tag from the job BeautifulSoup object
-        :param job
-        :return: Compensation object or None
-        """
-        pay_element = job.find("li", {"class": "perk_item perk_pay"})
-        if pay_element is None:
-            return None
-        pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
-
-        def create_compensation_object(pay_string: str) -> Compensation:
-            """
-            Creates a Compensation object from a pay_string
-            :param pay_string
-            :return: compensation
-            """
-            interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
-
-            amounts = []
-            for amount in pay_string.split("to"):
-                amount = amount.replace(",", "").strip("$ ").split(" ")[0]
-                if "K" in amount:
-                    amount = amount.replace("K", "")
-                    amount = int(float(amount)) * 1000
-                else:
-                    amount = int(float(amount))
-                amounts.append(amount)
-
-            compensation = Compensation(
-                interval=interval,
-                min_amount=min(amounts),
-                max_amount=max(amounts),
-                currency="USD/CAD",
-            )
-
-            return compensation
-
-        return create_compensation_object(pay)
-
-    @staticmethod
-    def get_location(job: Tag) -> Location:
-        """
-        Extracts the job location from BeatifulSoup object
-        :param job:
-        :return: location
-        """
-        location_link = job.find("a", {"class": "company_location"})
-        if location_link is not None:
-            location_string = location_link.text.strip()
-            parts = location_string.split(", ")
-            if len(parts) == 2:
-                city, state = parts
-            else:
-                city, state = None, None
-        else:
-            city, state = None, None
-        return Location(city=city, state=state, country=Country.US_CANADA)
-
    @staticmethod
    def headers() -> dict:
        """