chore: multiple proxies type allowed, fix: zip recruiter proxy not working

pull/155/head
Fasih Hussain 2024-05-25 16:31:19 +05:00
parent 65d2e5e707
commit 6e57647577
7 changed files with 24 additions and 24 deletions

View File

@@ -30,7 +30,7 @@ def scrape_jobs(
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: str | None = None,
+    proxies: list[str] | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
@@ -96,7 +96,7 @@ def scrape_jobs(
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxy=proxy)
+        scraper = scraper_class(proxies=proxies)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
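From the caller's side, this means scrape_jobs now takes a list of proxy URLs instead of a single string. A minimal usage sketch, assuming the package is importable as jobspy and that the other parameter names match the signature shown above; the proxy URLs are placeholders:

from jobspy import scrape_jobs

# With this commit, `proxies` replaces the old single-string `proxy`
# parameter, so several endpoints can be supplied at once.
jobs = scrape_jobs(
    site_name=["indeed", "zip_recruiter"],  # assumed parameter values
    search_term="software engineer",
    results_wanted=15,
    proxies=[
        "http://user:pass@proxy-a.example.com:8080",
        "http://user:pass@proxy-b.example.com:8080",
    ],
)
print(f"Scraped {len(jobs)} jobs")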

View File

@@ -39,9 +39,9 @@ class ScraperInput(BaseModel):
 class Scraper(ABC):
-    def __init__(self, site: Site, proxy: list[str] | None = None):
+    def __init__(self, site: Site, proxies: list[str] | None = None):
         self.site = site
-        self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
+        self.proxies = (lambda p: {"http": p, "https": p} if p else None)(proxies)

     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
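The one-liner lambda above is easy to misread; expanded, it is equivalent to the sketch below. Note that it maps the entire proxies value, now a list, onto both URL schemes unchanged, leaving it to each downstream session to consume that mapping:

# Equivalent expansion of the lambda in Scraper.__init__ (sketch only;
# the name build_proxies_mapping is hypothetical).
def build_proxies_mapping(proxies: list[str] | None) -> dict | None:
    if not proxies:
        return None
    # The whole list is assigned to both schemes as-is.
    return {"http": proxies, "https": proxies}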

View File

@@ -34,12 +34,12 @@ from ...jobs import (
 class GlassdoorScraper(Scraper):
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[list[str]] = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)
         self.base_url = None
         self.country = None
@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()
-        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        self.session = create_session(self.proxies, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token
@@ -245,7 +245,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy, has_retry=True)
+        session = create_session(self.proxies, has_retry=True)
         res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:

View File

@@ -33,7 +33,7 @@ from ...jobs import (
 class IndeedScraper(Scraper):
-    def __init__(self, proxy: str | None = None):
+    def __init__(self, proxies: str | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """
@@ -46,7 +46,7 @@ class IndeedScraper(Scraper):
         self.base_url = None
         self.api_url = "https://apis.indeed.com/graphql"
         site = Site(Site.INDEED)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -115,7 +115,7 @@ class IndeedScraper(Scraper):
             self.api_url,
             headers=api_headers,
             json=payload,
-            proxies=self.proxy,
+            proxies=self.proxies,
             timeout=10,
         )
         if response.status_code != 200:

View File

@@ -11,7 +11,7 @@ import time
 import random
 import regex as re
 import urllib.parse
-from typing import Optional
+from typing import List, Optional
 from datetime import datetime
 from threading import Lock
@@ -46,11 +46,11 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25

-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[List[str]] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
+        super().__init__(Site(Site.LINKEDIN), proxies=proxies)
         self.scraper_input = None
         self.country = "worldwide"
         self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
@@ -103,7 +103,7 @@ class LinkedInScraper(Scraper):
             f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
             params=params,
             allow_redirects=True,
-            proxies=self.proxy,
+            proxies=self.proxies,
             headers=self.headers,
             timeout=10,
         )
@@ -243,7 +243,7 @@ class LinkedInScraper(Scraper):
         try:
             session = create_session(is_tls=False, has_retry=True)
             response = session.get(
-                job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+                job_page_url, headers=self.headers, timeout=5, proxies=self.proxies
             )
             response.raise_for_status()
         except:

View File

@@ -53,7 +53,7 @@ def extract_emails_from_text(text: str) -> list[str] | None:
 def create_session(
-    proxy: dict | None = None,
+    proxies: dict | None = None,
     is_tls: bool = True,
    has_retry: bool = False,
     delay: int = 1,
@@ -64,12 +64,12 @@ def create_session(
     """
     if is_tls:
         session = tls_client.Session(random_tls_extension_order=True)
-        session.proxies = proxy
+        session.proxies = proxies
     else:
         session = requests.Session()
         session.allow_redirects = True
-        if proxy:
-            session.proxies.update(proxy)
+        if proxies:
+            session.proxies.update(proxies)
     if has_retry:
         retries = Retry(
             total=3,
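For reference, a sketch of how the renamed parameter is consumed by callers elsewhere in this commit; the import path is an assumption based on the repository layout, and the proxy URL is a placeholder:

# Hypothetical call site for create_session after the rename.
from jobspy.scrapers.utils import create_session  # assumed import path

proxies = {
    "http": "http://proxy.example.com:8080",
    "https": "http://proxy.example.com:8080",
}

# tls_client-backed session with retries, as GlassdoorScraper does above.
session = create_session(proxies, is_tls=True, has_retry=True)
response = session.get("https://www.glassdoor.com")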

View File

@@ -10,7 +10,7 @@ from __future__ import annotations
 import math
 import time
 from datetime import datetime
-from typing import Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any
 from concurrent.futures import ThreadPoolExecutor
@@ -36,14 +36,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"

-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[str] = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         self.scraper_input = None
-        self.session = create_session(proxy)
+        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
+        self.session = create_session(self.proxies)
         self._get_cookies()
-        super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
         self.delay = 5
         self.jobs_per_page = 20
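The reordering above is the actual ZipRecruiter fix. Before this commit, create_session(proxy) ran before super().__init__, so the session received the raw proxy string rather than the {"http": ..., "https": ...} mapping the base class builds, and _get_cookies() fired before the base class had initialized. A sketch of the corrected constructor, with the class body trimmed to the relevant lines; Site, Scraper, and create_session come from the surrounding package:

from typing import Optional

class ZipRecruiterScraper(Scraper):
    base_url = "https://www.ziprecruiter.com"
    api_url = "https://api.ziprecruiter.com"

    def __init__(self, proxies: Optional[str] = None):
        self.scraper_input = None
        # Base class first: it builds self.proxies from the argument,
        # so the session below receives the mapped dict, not the raw value.
        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
        self.session = create_session(self.proxies)
        self._get_cookies()
        self.delay = 5
        self.jobs_per_page = 20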