mirror of https://github.com/Bunsly/JobSpy
chore: multiple proxies type allowed, fix: zip recruiter proxy not working
parent 65d2e5e707
commit 6e57647577
@@ -30,7 +30,7 @@ def scrape_jobs(
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: str | None = None,
+    proxies: list[str] | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
@@ -96,7 +96,7 @@ def scrape_jobs(
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxy=proxy)
+        scraper = scraper_class(proxies=proxies)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
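Together, the two hunks above change the public scrape_jobs entry point from a single proxy string to a list of proxies, which is then forwarded to each site scraper. A usage sketch follows; only proxies and results_wanted appear in this diff, so site_name and search_term are assumed from the library's public interface, and the proxy URLs are placeholders:

    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name=["indeed", "linkedin"],  # assumed parameter, not shown in this diff
        search_term="software engineer",   # assumed parameter, not shown in this diff
        results_wanted=15,
        proxies=["http://user:pass@host1:8080", "http://user:pass@host2:8080"],
    )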
@@ -39,9 +39,9 @@ class ScraperInput(BaseModel):
 
 
 class Scraper(ABC):
-    def __init__(self, site: Site, proxy: list[str] | None = None):
+    def __init__(self, site: Site, proxies: list[str] | None = None):
         self.site = site
-        self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
+        self.proxies = (lambda p: {"http": p, "https": p} if p else None)(proxies)
 
     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
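The renamed attribute is still produced by the same inline lambda, which wraps its argument in the scheme-keyed mapping that requests-style clients expect, or yields None when no proxy is given. A minimal standalone sketch of that mapping, with a placeholder proxy URL:

    def to_proxies_mapping(p):
        # mirrors the lambda in Scraper.__init__ above
        return {"http": p, "https": p} if p else None

    print(to_proxies_mapping("http://user:pass@10.0.0.1:8080"))
    # {'http': 'http://user:pass@10.0.0.1:8080', 'https': 'http://user:pass@10.0.0.1:8080'}
    print(to_proxies_mapping(None))  # None

Note that the lambda treats its argument as one opaque value, so a list passed as proxies becomes the mapping's value as-is rather than being split across requests.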
@@ -34,12 +34,12 @@ from ...jobs import (
 
 
 class GlassdoorScraper(Scraper):
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[list[str]] = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)
 
         self.base_url = None
         self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()
 
-        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        self.session = create_session(self.proxies, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token
 
@@ -245,7 +245,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
            return "11047", "STATE"  # remote options
        url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy, has_retry=True)
+        session = create_session(self.proxies, has_retry=True)
        res = self.session.get(url, headers=self.headers)
        if res.status_code != 200:
            if res.status_code == 429:
@@ -33,7 +33,7 @@ from ...jobs import (
 
 
 class IndeedScraper(Scraper):
-    def __init__(self, proxy: str | None = None):
+    def __init__(self, proxies: str | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """
@@ -46,7 +46,7 @@ class IndeedScraper(Scraper):
         self.base_url = None
         self.api_url = "https://apis.indeed.com/graphql"
         site = Site(Site.INDEED)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -115,7 +115,7 @@ class IndeedScraper(Scraper):
             self.api_url,
             headers=api_headers,
             json=payload,
-            proxies=self.proxy,
+            proxies=self.proxies,
             timeout=10,
         )
         if response.status_code != 200:
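For reference, requests routes traffic through the scheme-keyed mapping passed via its proxies keyword. A minimal sketch with a hypothetical proxy URL and an empty payload (not expected to succeed against the live endpoint):

    import requests

    proxies = {"http": "http://10.0.0.1:8080", "https": "http://10.0.0.1:8080"}
    response = requests.post(
        "https://apis.indeed.com/graphql",  # api_url from the hunk above
        json={},
        proxies=proxies,
        timeout=10,
    )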
@@ -11,7 +11,7 @@ import time
 import random
 import regex as re
 import urllib.parse
-from typing import Optional
+from typing import List, Optional
 from datetime import datetime
 
 from threading import Lock
@@ -46,11 +46,11 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25
 
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[List[str]] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
+        super().__init__(Site(Site.LINKEDIN), proxies=proxies)
         self.scraper_input = None
         self.country = "worldwide"
         self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@@ -103,7 +103,7 @@ class LinkedInScraper(Scraper):
             f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
             params=params,
             allow_redirects=True,
-            proxies=self.proxy,
+            proxies=self.proxies,
             headers=self.headers,
             timeout=10,
         )
@@ -243,7 +243,7 @@ class LinkedInScraper(Scraper):
         try:
             session = create_session(is_tls=False, has_retry=True)
             response = session.get(
-                job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+                job_page_url, headers=self.headers, timeout=5, proxies=self.proxies
             )
             response.raise_for_status()
         except:
@@ -53,7 +53,7 @@ def extract_emails_from_text(text: str) -> list[str] | None:
 
 
 def create_session(
-    proxy: dict | None = None,
+    proxies: dict | None = None,
     is_tls: bool = True,
     has_retry: bool = False,
     delay: int = 1,
@@ -64,12 +64,12 @@ def create_session(
     """
     if is_tls:
         session = tls_client.Session(random_tls_extension_order=True)
-        session.proxies = proxy
+        session.proxies = proxies
     else:
         session = requests.Session()
         session.allow_redirects = True
-        if proxy:
-            session.proxies.update(proxy)
+        if proxies:
+            session.proxies.update(proxies)
     if has_retry:
         retries = Retry(
             total=3,
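In the non-TLS branch, the session now pulls its proxies from the renamed parameter before retries are wired up. A self-contained sketch of that branch as it reads after this change; only total=3 is visible in the diff, so the adapter mounting and any further Retry arguments are assumptions:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def create_session_sketch(proxies: dict | None = None, has_retry: bool = False) -> requests.Session:
        session = requests.Session()
        session.allow_redirects = True
        if proxies:
            session.proxies.update(proxies)  # e.g. {"http": "...", "https": "..."}
        if has_retry:
            retries = Retry(total=3)  # only total=3 is shown in the diff
            adapter = HTTPAdapter(max_retries=retries)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
        return session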
@@ -10,7 +10,7 @@ from __future__ import annotations
 import math
 import time
 from datetime import datetime
-from typing import Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any
 
 from concurrent.futures import ThreadPoolExecutor
 
@@ -36,14 +36,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"
 
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[str] = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         self.scraper_input = None
-        self.session = create_session(proxy)
+        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
+        self.session = create_session(self.proxies)
         self._get_cookies()
-        super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
 
         self.delay = 5
         self.jobs_per_page = 20
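This final hunk is the commit's ZipRecruiter fix. Previously the session was built from the raw proxy argument and super().__init__ only ran afterwards, so the normalized self.proxies mapping never reached the session; the reorder runs the base initializer first and feeds self.proxies into create_session. A simplified sketch of the pattern with hypothetical class names and a placeholder proxy URL:

    class Base:
        def __init__(self, proxies=None):
            # normalize the raw value into the scheme-keyed mapping
            self.proxies = {"http": proxies, "https": proxies} if proxies else None

    class Child(Base):
        def __init__(self, proxies=None):
            super().__init__(proxies)            # must run first so self.proxies exists
            self.session_proxies = self.proxies  # safe: reads the normalized mapping

    Child("http://127.0.0.1:8080")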