chore: allow a list of proxies; fix: ZipRecruiter proxy not being applied

pull/155/head
Fasih Hussain 2024-05-25 16:31:19 +05:00
parent 65d2e5e707
commit 6e57647577
7 changed files with 24 additions and 24 deletions

View File

@ -30,7 +30,7 @@ def scrape_jobs(
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: str | None = None,
proxies: list[str] | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
@ -96,7 +96,7 @@ def scrape_jobs(
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraper = scraper_class(proxies=proxies)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name

View File

@ -39,9 +39,9 @@ class ScraperInput(BaseModel):
class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
def __init__(self, site: Site, proxies: list[str] | None = None):
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
self.proxies = (lambda p: {"http": p, "https": p} if p else None)(proxies)
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@ -34,12 +34,12 @@ from ...jobs import (
class GlassdoorScraper(Scraper):
def __init__(self, proxy: Optional[str] = None):
def __init__(self, proxies: Optional[list[str]] = None):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy)
super().__init__(site, proxies=proxies)
self.base_url = None
self.country = None
@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(self.proxy, is_tls=True, has_retry=True)
self.session = create_session(self.proxies, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token
@ -245,7 +245,7 @@ class GlassdoorScraper(Scraper):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True)
session = create_session(self.proxies, has_retry=True)
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:

View File

@ -33,7 +33,7 @@ from ...jobs import (
class IndeedScraper(Scraper):
def __init__(self, proxy: str | None = None):
def __init__(self, proxies: list[str] | None = None):
"""
Initializes IndeedScraper with the Indeed API url
"""
@ -46,7 +46,7 @@ class IndeedScraper(Scraper):
self.base_url = None
self.api_url = "https://apis.indeed.com/graphql"
site = Site(Site.INDEED)
super().__init__(site, proxy=proxy)
super().__init__(site, proxies=proxies)
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -115,7 +115,7 @@ class IndeedScraper(Scraper):
self.api_url,
headers=api_headers,
json=payload,
proxies=self.proxy,
proxies=self.proxies,
timeout=10,
)
if response.status_code != 200:

View File

@ -11,7 +11,7 @@ import time
import random
import regex as re
import urllib.parse
from typing import Optional
from typing import List, Optional
from datetime import datetime
from threading import Lock
@ -46,11 +46,11 @@ class LinkedInScraper(Scraper):
band_delay = 4
jobs_per_page = 25
def __init__(self, proxy: Optional[str] = None):
def __init__(self, proxies: Optional[List[str]] = None):
"""
Initializes LinkedInScraper with the LinkedIn job search url
"""
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
super().__init__(Site(Site.LINKEDIN), proxies=proxies)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@ -103,7 +103,7 @@ class LinkedInScraper(Scraper):
f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params,
allow_redirects=True,
proxies=self.proxy,
proxies=self.proxies,
headers=self.headers,
timeout=10,
)
@ -243,7 +243,7 @@ class LinkedInScraper(Scraper):
try:
session = create_session(is_tls=False, has_retry=True)
response = session.get(
job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
job_page_url, headers=self.headers, timeout=5, proxies=self.proxies
)
response.raise_for_status()
except:

View File

@ -53,7 +53,7 @@ def extract_emails_from_text(text: str) -> list[str] | None:
def create_session(
proxy: dict | None = None,
proxies: dict | None = None,
is_tls: bool = True,
has_retry: bool = False,
delay: int = 1,
@ -64,12 +64,12 @@ def create_session(
"""
if is_tls:
session = tls_client.Session(random_tls_extension_order=True)
session.proxies = proxy
session.proxies = proxies
else:
session = requests.Session()
session.allow_redirects = True
if proxy:
session.proxies.update(proxy)
if proxies:
session.proxies.update(proxies)
if has_retry:
retries = Retry(
total=3,

View File

@ -10,7 +10,7 @@ from __future__ import annotations
import math
import time
from datetime import datetime
from typing import Optional, Tuple, Any
from typing import List, Optional, Tuple, Any
from concurrent.futures import ThreadPoolExecutor
@ -36,14 +36,14 @@ class ZipRecruiterScraper(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
def __init__(self, proxy: Optional[str] = None):
def __init__(self, proxies: Optional[List[str]] = None):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
"""
self.scraper_input = None
self.session = create_session(proxy)
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
self.session = create_session(self.proxies)
self._get_cookies()
super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
self.delay = 5
self.jobs_per_page = 20