mirror of https://github.com/Bunsly/JobSpy

chore: multiple proxies type allowed, fix: zip recruiter proxy not working
parent 65d2e5e707
commit 6e57647577
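The commit renames the single `proxy` argument to `proxies` across the public API and every scraper, so callers now pass a list of proxy URLs. A minimal usage sketch against the new signature; the `from jobspy import scrape_jobs` entry point and the `site_name`/`search_term` parameters follow the upstream README, and the proxy URLs are placeholders:

from jobspy import scrape_jobs

# Placeholder proxy endpoints; any URL accepted by requests works here.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
    search_term="software engineer",
    results_wanted=15,
    country_indeed="usa",
    proxies=[
        "http://user:pass@203.0.113.10:8080",
        "http://user:pass@203.0.113.11:8080",
    ],
)
print(len(jobs))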
@@ -30,7 +30,7 @@ def scrape_jobs(
     results_wanted: int = 15,
     country_indeed: str = "usa",
     hyperlinks: bool = False,
-    proxy: str | None = None,
+    proxies: list[str] | None = None,
     description_format: str = "markdown",
     linkedin_fetch_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,

@@ -96,7 +96,7 @@ def scrape_jobs(
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class(proxy=proxy)
+        scraper = scraper_class(proxies=proxies)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         cap_name = site.value.capitalize()
         site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
@@ -39,9 +39,9 @@ class ScraperInput(BaseModel):
 
 
 class Scraper(ABC):
-    def __init__(self, site: Site, proxy: list[str] | None = None):
+    def __init__(self, site: Site, proxies: list[str] | None = None):
         self.site = site
-        self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
+        self.proxies = (lambda p: {"http": p, "https": p} if p else None)(proxies)
 
     @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
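For reference, the wrapper in the hunk above maps whatever value it receives onto both URL schemes unchanged. A small sketch of what `self.proxies` ends up holding (plain Python; `wrap` is a name local to this sketch):

wrap = lambda p: {"http": p, "https": p} if p else None

wrap("http://user:pass@203.0.113.10:8080")
# -> {'http': 'http://user:pass@203.0.113.10:8080',
#     'https': 'http://user:pass@203.0.113.10:8080'}
wrap(None)
# -> None

# Note: handed a list, the same lambda yields {'http': [...], 'https': [...]};
# the requests-style mapping shape only arises from a single URL string.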
@@ -34,12 +34,12 @@ from ...jobs import (
 
 
 class GlassdoorScraper(Scraper):
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[list[str]] = None):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
         site = Site(Site.GLASSDOOR)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)
 
         self.base_url = None
         self.country = None

@@ -59,7 +59,7 @@ class GlassdoorScraper(Scraper):
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()
 
-        self.session = create_session(self.proxy, is_tls=True, has_retry=True)
+        self.session = create_session(self.proxies, is_tls=True, has_retry=True)
         token = self._get_csrf_token()
         self.headers["gd-csrf-token"] = token if token else self.fallback_token
 

@@ -245,7 +245,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy, has_retry=True)
+        session = create_session(self.proxies, has_retry=True)
         res = self.session.get(url, headers=self.headers)
         if res.status_code != 200:
             if res.status_code == 429:
@@ -33,7 +33,7 @@ from ...jobs import (
 
 
 class IndeedScraper(Scraper):
-    def __init__(self, proxy: str | None = None):
+    def __init__(self, proxies: str | None = None):
         """
         Initializes IndeedScraper with the Indeed API url
         """

@@ -46,7 +46,7 @@ class IndeedScraper(Scraper):
         self.base_url = None
         self.api_url = "https://apis.indeed.com/graphql"
         site = Site(Site.INDEED)
-        super().__init__(site, proxy=proxy)
+        super().__init__(site, proxies=proxies)
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """

@@ -115,7 +115,7 @@ class IndeedScraper(Scraper):
             self.api_url,
             headers=api_headers,
             json=payload,
-            proxies=self.proxy,
+            proxies=self.proxies,
             timeout=10,
         )
         if response.status_code != 200:
@@ -11,7 +11,7 @@ import time
 import random
 import regex as re
 import urllib.parse
-from typing import Optional
+from typing import List, Optional
 from datetime import datetime
 
 from threading import Lock

@@ -46,11 +46,11 @@ class LinkedInScraper(Scraper):
     band_delay = 4
     jobs_per_page = 25
 
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[List[str]] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
-        super().__init__(Site(Site.LINKEDIN), proxy=proxy)
+        super().__init__(Site(Site.LINKEDIN), proxies=proxies)
         self.scraper_input = None
         self.country = "worldwide"
         self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')

@@ -103,7 +103,7 @@ class LinkedInScraper(Scraper):
             f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
             params=params,
             allow_redirects=True,
-            proxies=self.proxy,
+            proxies=self.proxies,
             headers=self.headers,
             timeout=10,
         )

@@ -243,7 +243,7 @@ class LinkedInScraper(Scraper):
         try:
             session = create_session(is_tls=False, has_retry=True)
             response = session.get(
-                job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+                job_page_url, headers=self.headers, timeout=5, proxies=self.proxies
             )
             response.raise_for_status()
         except:
@@ -53,7 +53,7 @@ def extract_emails_from_text(text: str) -> list[str] | None:
 
 
 def create_session(
-    proxy: dict | None = None,
+    proxies: dict | None = None,
     is_tls: bool = True,
     has_retry: bool = False,
     delay: int = 1,

@@ -64,12 +64,12 @@ def create_session(
     """
     if is_tls:
         session = tls_client.Session(random_tls_extension_order=True)
-        session.proxies = proxy
+        session.proxies = proxies
     else:
         session = requests.Session()
         session.allow_redirects = True
-        if proxy:
-            session.proxies.update(proxy)
+        if proxies:
+            session.proxies.update(proxies)
     if has_retry:
         retries = Retry(
             total=3,
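A short sketch of how the renamed parameter is consumed, assuming the `create_session` signature in the hunk above; the proxy mapping is a placeholder in the scheme-keyed shape the `Scraper` base class produces:

# requests-style mapping, as built by Scraper.__init__.
proxies = {
    "http": "http://user:pass@203.0.113.10:8080",
    "https": "http://user:pass@203.0.113.10:8080",
}

# is_tls=False takes the `else` branch above: a plain requests.Session
# whose session.proxies is updated from the mapping, plus retries.
session = create_session(proxies, is_tls=False, has_retry=True)
resp = session.get("https://api.ziprecruiter.com", timeout=10)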
@@ -10,7 +10,7 @@ from __future__ import annotations
 import math
 import time
 from datetime import datetime
-from typing import Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any
 
 from concurrent.futures import ThreadPoolExecutor
 

@@ -36,14 +36,14 @@ class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"
 
-    def __init__(self, proxy: Optional[str] = None):
+    def __init__(self, proxies: Optional[str] = None):
         """
         Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         self.scraper_input = None
-        self.session = create_session(proxy)
+        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
+        self.session = create_session(self.proxies)
         self._get_cookies()
-        super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
 
         self.delay = 5
         self.jobs_per_page = 20
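The reordering above is the "fix: zip recruiter proxy not working" half of the commit message: previously the session was created from the raw `proxy` argument before `super().__init__` had run, so it presumably never saw the scheme-keyed mapping that `Scraper.__init__` builds. In outline:

# Old order: session configured from the raw argument, before base init.
#     self.session = create_session(proxy)
#     super().__init__(Site.ZIP_RECRUITER, proxy=proxy)
# New order: base init first, so create_session receives self.proxies,
# the {"http": ..., "https": ...} mapping prepared by Scraper.__init__.
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
self.session = create_session(self.proxies)
self._get_cookies()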