From 6e576475770d4f01405593d9f5abb7bc67a75567 Mon Sep 17 00:00:00 2001
From: Fasih Hussain
Date: Sat, 25 May 2024 16:31:19 +0500
Subject: [PATCH] chore: multiple proxies type allowed, fix: zip recruiter
 proxy not working

---
 src/jobspy/__init__.py                       |  4 ++--
 src/jobspy/scrapers/__init__.py              |  4 ++--
 src/jobspy/scrapers/glassdoor/__init__.py    |  8 ++++----
 src/jobspy/scrapers/indeed/__init__.py       |  6 +++---
 src/jobspy/scrapers/linkedin/__init__.py     | 10 +++++-----
 src/jobspy/scrapers/utils.py                 |  8 ++++----
 src/jobspy/scrapers/ziprecruiter/__init__.py |  8 ++++----
 7 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index a2656cb..5069191 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -30,7 +30,7 @@ def scrape_jobs(
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: str | None = None,
+    proxies: list[str] | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
@@ -96,7 +96,7 @@ def scrape_jobs(

     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxy=proxy)
+        scraper = scraper_class(proxies=proxies)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 0ff2382..6cf4d49 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -39,9 +39,9 @@ class ScraperInput(BaseModel):


 class Scraper(ABC):
-    def __init__(self, site: Site, proxy: list[str] | None = None):
+    def __init__(self, site: Site, proxies: list[str] | None = None):
         self.site = site
-        self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
+        self.proxies = {"http": proxies, "https": proxies} if proxies else None

     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
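A minimal usage sketch of the API change above, for orientation. It relies
only on names visible in this patch (scrape_jobs, its new proxies parameter,
and the mapping built in Scraper.__init__); the other scrape_jobs arguments
are assumed to keep the defaults shown in src/jobspy/__init__.py, and the
proxy URLs are placeholders.

    from jobspy import scrape_jobs

    # proxies is now a list of proxy URLs rather than a single string
    jobs = scrape_jobs(
        results_wanted=15,
        country_indeed="usa",
        proxies=["http://user:pass@host1:8080", "http://user:pass@host2:8080"],
    )

    # Scraper.__init__ folds the whole list into one requests-style mapping,
    # {"http": <list>, "https": <list>}, which each scraper then forwards to
    # create_session and to per-request proxies= keywords.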
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 89f5a95..70b411b 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -34,12 +34,12 @@ from ...jobs import (


 class GlassdoorScraper(Scraper):
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[list[str]] = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)

         self.base_url = None
         self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()

-        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        self.session = create_session(self.proxies, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token

@@ -245,7 +245,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy, has_retry=True)
+        session = create_session(self.proxies, has_retry=True)
         res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 58303f5..86c7fe8 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -33,7 +33,7 @@ from ...jobs import (


 class IndeedScraper(Scraper):
-    def __init__(self, proxy: str | None = None):
+    def __init__(self, proxies: list[str] | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """
@@ -46,7 +46,7 @@ class IndeedScraper(Scraper):
         self.base_url = None
         self.api_url = "https://apis.indeed.com/graphql"
         site = Site(Site.INDEED)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -115,7 +115,7 @@ class IndeedScraper(Scraper):
             self.api_url,
             headers=api_headers,
             json=payload,
-            proxies=self.proxy,
+            proxies=self.proxies,
             timeout=10,
         )
         if response.status_code != 200:
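As background for the proxies= keyword threaded through the Indeed hunk
above: a hedged sketch of how a requests-style mapping is consumed by plain
requests, which expects one proxy URL string per scheme. The URL is the
api_url from the scraper; the proxy address and payload are placeholders.

    import requests

    proxies = {
        "http": "http://user:pass@proxy:8080",
        "https": "http://user:pass@proxy:8080",
    }
    # Requests for each scheme are tunneled through the matching proxy entry.
    response = requests.post(
        "https://apis.indeed.com/graphql",
        json={"query": "..."},  # placeholder; the scraper builds the real payload
        proxies=proxies,
        timeout=10,
    )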
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 18fbb84..10bcff8 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -11,7 +11,7 @@ import time
 import random
 import regex as re
 import urllib.parse
-from typing import Optional
+from typing import List, Optional
 from datetime import datetime
 from threading import Lock

@@ -46,11 +46,11 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25

-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[List[str]] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
+        super().__init__(Site(Site.LINKEDIN), proxies=proxies)
         self.scraper_input = None
         self.country = "worldwide"
         self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@@ -103,7 +103,7 @@ class LinkedInScraper(Scraper):
                 f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                 params=params,
                 allow_redirects=True,
-                proxies=self.proxy,
+                proxies=self.proxies,
                 headers=self.headers,
                 timeout=10,
             )
@@ -243,7 +243,7 @@ class LinkedInScraper(Scraper):
         try:
             session = create_session(is_tls=False, has_retry=True)
             response = session.get(
-                job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+                job_page_url, headers=self.headers, timeout=5, proxies=self.proxies
             )
             response.raise_for_status()
         except:
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 8fef421..f558988 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -53,7 +53,7 @@ def extract_emails_from_text(text: str) -> list[str] | None:


 def create_session(
-    proxy: dict | None = None,
+    proxies: dict | None = None,
     is_tls: bool = True,
     has_retry: bool = False,
     delay: int = 1,
@@ -64,12 +64,12 @@ def create_session(
     """
     if is_tls:
         session = tls_client.Session(random_tls_extension_order=True)
-        session.proxies = proxy
+        session.proxies = proxies
     else:
         session = requests.Session()
         session.allow_redirects = True
-        if proxy:
-            session.proxies.update(proxy)
+        if proxies:
+            session.proxies.update(proxies)
     if has_retry:
         retries = Retry(
             total=3,
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index fbe896f..c9ea58a 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -10,7 +10,7 @@ from __future__ import annotations
 import math
 import time
 from datetime import datetime
-from typing import Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any

 from concurrent.futures import ThreadPoolExecutor

@@ -36,14 +36,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"

-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[List[str]] = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         self.scraper_input = None
-        self.session = create_session(proxy)
+        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
+        self.session = create_session(self.proxies)
         self._get_cookies()
-        super().__init__(Site.ZIP_RECRUITER, proxy=proxy)

         self.delay = 5
         self.jobs_per_page = 20
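For reference, a condensed sketch of create_session as it stands after the
utils.py hunk above. That hunk truncates inside Retry(total=3, ...), so the
remaining retry arguments and the HTTPAdapter mounting below are assumptions
for illustration, not part of the patch.

    import requests
    import tls_client
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def create_session(proxies: dict | None = None,
                       is_tls: bool = True,
                       has_retry: bool = False):
        if is_tls:
            # tls-client session: the proxies mapping is assigned wholesale
            session = tls_client.Session(random_tls_extension_order=True)
            session.proxies = proxies
        else:
            session = requests.Session()
            session.allow_redirects = True
            if proxies:
                session.proxies.update(proxies)
        if has_retry:
            # assumed continuation of the truncated hunk: mount a retry
            # policy on the plain-requests session
            retries = Retry(total=3, backoff_factor=1)
            if isinstance(session, requests.Session):
                session.mount("https://", HTTPAdapter(max_retries=retries))
                session.mount("http://", HTTPAdapter(max_retries=retries))
        return session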