Compare commits


9 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Cullen Watson | 2ec3b04777 | fix(ziprecruiter): init cookies (#82) | 2024-01-12 12:28:35 -06:00 |
| Harish Vadaparty | 89a5264391 | add long scrape example (#81) | 2024-01-12 12:24:00 -06:00 |
| Cullen Watson | a7ad616567 | fix: linkedin no results (#80) | 2024-01-10 14:01:10 -06:00 |
| cullenwatson | 53bc33a43a | chore: version | 2024-01-09 19:33:56 -06:00 |
| Cullen Watson | 22870438c7 | linkedin fix delays (#79) | 2024-01-09 19:32:51 -06:00 |
| Cullen Watson | aeb93b99f5 | Update pyproject.toml | 2024-01-03 12:04:50 -06:00 |
| Cullen Watson | a5916edcdd | fix(glassdoor): add retry adapter (#77) | 2024-01-03 12:04:32 -06:00 |
| Augusto Gunsch | 33d442bf1e | Add czech to Indeed (#72) | 2023-12-02 02:42:54 -06:00 |
| Zachary Hampton | 6587e464fa | Update README.md | 2023-11-30 11:49:31 -07:00 |
10 changed files with 208 additions and 115 deletions

View File

@@ -7,9 +7,6 @@
 *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
 work with us.*
-Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
-for real estate scraping*
 ## Features
 - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously

View File

@@ -2,12 +2,11 @@ from jobspy import scrape_jobs
 import pandas as pd
 
 jobs: pd.DataFrame = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
-    results_wanted=50,  # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho)
+    results_wanted=25,  # be wary the higher it is, the more likey you'll get blocked (rotating proxy can help tho)
     country_indeed="USA",
-    offset=25  # start jobs from an offset (use if search failed and want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )

View File

@@ -0,0 +1,77 @@
+from jobspy import scrape_jobs
+import pandas as pd
+import os
+import time
+
+# creates a new csv filename if jobs.csv already exists
+csv_filename = "jobs.csv"
+counter = 1
+while os.path.exists(csv_filename):
+    csv_filename = f"jobs_{counter}.csv"
+    counter += 1
+
+# results wanted and offset
+results_wanted = 1000
+offset = 0
+all_jobs = []
+
+# max retries
+max_retries = 3
+
+# number of results at each iteration
+results_in_each_iteration = 30
+
+while len(all_jobs) < results_wanted:
+    retry_count = 0
+    while retry_count < max_retries:
+        print("Doing from", offset, "to", offset + results_in_each_iteration, "jobs")
+        try:
+            jobs = scrape_jobs(
+                site_name=["indeed"],
+                search_term="software engineer",
+                # New York, NY
+                # Dallas, TX
+                # Los Angeles, CA
+                location="Los Angeles, CA",
+                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                country_indeed="USA",
+                offset=offset,
+                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+            )
+
+            # Add the scraped jobs to the list
+            all_jobs.extend(jobs.to_dict('records'))
+
+            # Increment the offset for the next page of results
+            offset += results_in_each_iteration
+
+            # Add a delay to avoid rate limiting (you can adjust the delay time as needed)
+            print(f"Scraped {len(all_jobs)} jobs")
+            print("Sleeping secs", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))  # sleep between batches; scales with retry count
+
+            break  # Break out of the retry loop if successful
+        except Exception as e:
+            print(f"Error: {e}")
+            retry_count += 1
+            print("Sleeping secs before retry", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))
+            if retry_count >= max_retries:
+                print("Max retries reached. Exiting.")
+                break
+
+# DataFrame from the collected job data
+jobs_df = pd.DataFrame(all_jobs)
+
+# Formatting
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.width", None)
+pd.set_option("display.max_colwidth", 50)
+print(jobs_df)
+
+jobs_df.to_csv(csv_filename, index=False)
+print(f"Outputted to {csv_filename}")

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.29"
+version = "1.1.34"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -55,18 +55,24 @@ class JobType(Enum):
 class Country(Enum):
-    ARGENTINA = ("argentina", "com.ar")
+    """
+    Gets the subdomain for Indeed and Glassdoor.
+    The second item in the tuple is the subdomain for Indeed
+    The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
+    """
+    ARGENTINA = ("argentina", "ar", "com.ar")
     AUSTRALIA = ("australia", "au", "com.au")
     AUSTRIA = ("austria", "at", "at")
     BAHRAIN = ("bahrain", "bh")
-    BELGIUM = ("belgium", "be", "nl:be")
+    BELGIUM = ("belgium", "be", "fr:be")
     BRAZIL = ("brazil", "br", "com.br")
     CANADA = ("canada", "ca", "ca")
     CHILE = ("chile", "cl")
     CHINA = ("china", "cn")
     COLOMBIA = ("colombia", "co")
     COSTARICA = ("costa rica", "cr")
-    CZECHREPUBLIC = ("czech republic", "cz")
+    CZECHREPUBLIC = ("czech republic,czechia", "cz")
     DENMARK = ("denmark", "dk")
     ECUADOR = ("ecuador", "ec")
     EGYPT = ("egypt", "eg")

@@ -112,8 +118,8 @@ class Country(Enum):
     TURKEY = ("turkey", "tr")
     UKRAINE = ("ukraine", "ua")
     UNITEDARABEMIRATES = ("united arab emirates", "ae")
-    UK = ("uk", "uk", "co.uk")
-    USA = ("usa", "www", "com")
+    UK = ("uk,united kingdom", "uk", "co.uk")
+    USA = ("usa,us,united states", "www", "com")
     URUGUAY = ("uruguay", "uy")
     VENEZUELA = ("venezuela", "ve")
     VIETNAM = ("vietnam", "vn")

@@ -147,7 +153,8 @@ class Country(Enum):
         """Convert a string to the corresponding Country enum."""
         country_str = country_str.strip().lower()
         for country in cls:
-            if country.value[0] == country_str:
+            country_names = country.value[0].split(',')
+            if country_str in country_names:
                 return country
         valid_countries = [country.value for country in cls]
         raise ValueError(

@@ -167,10 +174,13 @@ class Location(BaseModel):
         if self.state:
             location_parts.append(self.state)
         if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
-            if self.country.value[0] in ("usa", "uk"):
-                location_parts.append(self.country.value[0].upper())
+            country_name = self.country.value[0]
+            if "," in country_name:
+                country_name = country_name.split(",")[0]
+            if country_name in ("usa", "uk"):
+                location_parts.append(country_name.upper())
             else:
-                location_parts.append(self.country.value[0].title())
+                location_parts.append(country_name.title())
         return ", ".join(location_parts)

@@ -181,6 +191,10 @@ class CompensationInterval(Enum):
     DAILY = "daily"
     HOURLY = "hourly"
 
+    @classmethod
+    def get_interval(cls, pay_period):
+        return cls[pay_period].value if pay_period in cls.__members__ else None
+
 class Compensation(BaseModel):
     interval: Optional[CompensationInterval] = None
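For reference, the comma-separated first tuple element is what lets `from_string` accept several aliases per country. A minimal standalone sketch of the matching logic (the two members shown are a reduced subset of the real enum, for illustration only):

    from enum import Enum

    class Country(Enum):
        # reduced subset of the real enum, for illustration only
        UK = ("uk,united kingdom", "uk", "co.uk")
        USA = ("usa,us,united states", "www", "com")

        @classmethod
        def from_string(cls, country_str: str) -> "Country":
            # same alias matching as the patched from_string above
            country_str = country_str.strip().lower()
            for country in cls:
                if country_str in country.value[0].split(","):
                    return country
            raise ValueError(f"Invalid country string: '{country_str}'")

    assert Country.from_string("United States") is Country.USA
    assert Country.from_string(" uk ") is Country.UK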

View File

@@ -4,17 +4,13 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
-import math
-import time
-import re
 import json
-from datetime import datetime, date
-from typing import Optional, Tuple, Any
-from bs4 import BeautifulSoup
+from typing import Optional, Any
+from datetime import datetime, timedelta
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
+from ..utils import create_session
 from ...jobs import (
     JobPost,
     Compensation,

@@ -22,7 +18,6 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    Country,
 )

@@ -31,7 +26,7 @@ class GlassdoorScraper(Scraper):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
-        site = Site(Site.ZIP_RECRUITER)
+        site = Site(Site.GLASSDOOR)
         super().__init__(site, proxy=proxy)
 
         self.url = None

@@ -49,15 +44,12 @@ class GlassdoorScraper(Scraper):
     ) -> (list[JobPost], str | None):
         """
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
-        :param scraper_input:
-        :return: jobs found on page
-        :return: cursor for next page
         """
         try:
             payload = self.add_payload(
                 scraper_input, location_id, location_type, page_num, cursor
             )
-            session = create_session(self.proxy, is_tls=False)
+            session = create_session(self.proxy, is_tls=False, has_retry=True)
             response = session.post(
                 f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
             )

@@ -86,8 +78,9 @@ class GlassdoorScraper(Scraper):
         company_name = job["header"]["employerNameFromSearch"]
         location_name = job["header"].get("locationName", "")
         location_type = job["header"].get("locationType", "")
-        is_remote = False
-        location = None
+        age_in_days = job["header"].get("ageInDays")
+        is_remote, location = False, None
+        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
 
         if location_type == "S":
             is_remote = True

@@ -99,10 +92,11 @@ class GlassdoorScraper(Scraper):
         job = JobPost(
             title=title,
             company_name=company_name,
+            date_posted=date_posted,
             job_url=job_url,
             location=location,
             compensation=compensation,
-            is_remote=is_remote,
+            is_remote=is_remote
         )
         jobs.append(job)

@@ -161,15 +155,8 @@ class GlassdoorScraper(Scraper):
         interval = None
         if pay_period == "ANNUAL":
             interval = CompensationInterval.YEARLY
-        elif pay_period == "MONTHLY":
-            interval = CompensationInterval.MONTHLY
-        elif pay_period == "WEEKLY":
-            interval = CompensationInterval.WEEKLY
-        elif pay_period == "DAILY":
-            interval = CompensationInterval.DAILY
-        elif pay_period == "HOURLY":
-            interval = CompensationInterval.HOURLY
+        elif pay_period:
+            interval = CompensationInterval.get_interval(pay_period)
 
         min_amount = int(adjusted_pay.get("p10") // 1)
         max_amount = int(adjusted_pay.get("p90") // 1)

@@ -180,17 +167,11 @@ class GlassdoorScraper(Scraper):
             currency=currency,
         )
 
-    def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return [job_type]
-        return None
-
     def get_location(self, location: str, is_remote: bool) -> (int, str):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy)
+        session = create_session(self.proxy, has_retry=True)
         response = session.get(url)
         if response.status_code != 200:
             raise GlassdoorException(

@@ -213,7 +194,7 @@ class GlassdoorScraper(Scraper):
         location_type: str,
         page_num: int,
         cursor: str | None = None,
-    ) -> dict[str, str | Any]:
+    ) -> str:
         payload = {
             "operationName": "JobSearchResultsQuery",
             "variables": {

@@ -243,10 +224,17 @@ class GlassdoorScraper(Scraper):
             payload["variables"]["filterParams"].append(
                 {"filterKey": "jobType", "values": filter_value}
             )
         return json.dumps([payload])
 
-    def parse_location(self, location_name: str) -> Location:
+    @staticmethod
+    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+        return None
+
+    @staticmethod
+    def parse_location(location_name: str) -> Location:
         if not location_name or location_name == "Remote":
             return None
         city, _, state = location_name.partition(", ")
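The collapsed elif chain works because most `CompensationInterval` member names match Glassdoor's pay-period strings exactly; only "ANNUAL" keeps its own branch, since that member is named `YEARLY`. A standalone sketch (enum values assumed as defined in the jobs module; `age_in_days` is a hypothetical header value):

    from datetime import datetime, timedelta
    from enum import Enum

    class CompensationInterval(Enum):
        YEARLY = "yearly"
        MONTHLY = "monthly"
        WEEKLY = "weekly"
        DAILY = "daily"
        HOURLY = "hourly"

        @classmethod
        def get_interval(cls, pay_period):
            # name-based member lookup; None for pay periods we don't model
            return cls[pay_period].value if pay_period in cls.__members__ else None

    assert CompensationInterval.get_interval("HOURLY") == "hourly"
    assert CompensationInterval.get_interval("ANNUAL") is None  # hence the explicit ANNUAL branch

    # date_posted is likewise derived from the header's ageInDays field:
    age_in_days = 3  # hypothetical value of job["header"].get("ageInDays")
    date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
    print(date_posted)  # three days ago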

View File

@@ -64,6 +64,7 @@ class IndeedScraper(Scraper):
             "l": scraper_input.location,
             "filter": 0,
             "start": scraper_input.offset + page * 10,
+            "sort": "date"
         }
         if scraper_input.distance:
             params["radius"] = scraper_input.distance

@@ -150,6 +151,7 @@ class IndeedScraper(Scraper):
             title=job["normTitle"],
             description=description,
             company_name=job["company"],
+            company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
             location=Location(
                 city=job.get("jobLocationCity"),
                 state=job.get("jobLocationState"),

@@ -305,7 +307,7 @@ class IndeedScraper(Scraper):
                 raise IndeedException("Could not find mosaic provider job cards data")
         else:
             raise IndeedException(
-                "Could not find a script tag containing mosaic provider data"
+                "Could not find any results for the search"
             )
 
     @staticmethod
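The new `company_url` line leans on Python's conditional-expression precedence: the concatenation binds tighter than `if`/`else`, so the key is only read when present. A tiny check with hypothetical values:

    url = "https://www.indeed.com"

    job = {"companyOverviewLink": "/cmp/acme"}
    # parses as (url + job[...]) if ... else None
    company_url = url + job["companyOverviewLink"] if "companyOverviewLink" in job else None
    assert company_url == "https://www.indeed.com/cmp/acme"

    job = {}  # listing without a company overview page
    company_url = url + job["companyOverviewLink"] if "companyOverviewLink" in job else None
    assert company_url is None  # no KeyError: the lookup is never evaluated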

View File

@@ -4,6 +4,7 @@ jobspy.scrapers.linkedin
 This module contains routines to scrape LinkedIn.
 """
+import random
 from typing import Optional
 from datetime import datetime

@@ -16,14 +17,14 @@ from threading import Lock
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
+from ..utils import create_session
 from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 
 class LinkedInScraper(Scraper):
-    MAX_RETRIES = 3
-    DELAY = 10
+    DELAY = 3

@@ -57,6 +58,7 @@ class LinkedInScraper(Scraper):
             return mapping.get(job_type_enum, "")
 
         while len(job_list) < scraper_input.results_wanted and page < 1000:
+            session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
                 "location": scraper_input.location,

@@ -71,44 +73,30 @@ class LinkedInScraper(Scraper):
             }
 
             params = {k: v for k, v in params.items() if v is not None}
-            retries = 0
-            while retries < self.MAX_RETRIES:
-                try:
-                    response = requests.get(
-                        f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                        params=params,
-                        allow_redirects=True,
-                        proxies=self.proxy,
-                        timeout=10,
-                    )
-                    response.raise_for_status()
-                    break
-                except requests.HTTPError as e:
-                    if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code in (429, 502):
-                            time.sleep(self.DELAY)
-                            retries += 1
-                            continue
-                        else:
-                            raise LinkedInException(
-                                f"bad response status code: {e.response.status_code}"
-                            )
-                    else:
-                        raise
-                except ProxyError as e:
-                    raise LinkedInException("bad proxy")
-                except Exception as e:
-                    raise LinkedInException(str(e))
-            else:
-                # Raise an exception if the maximum number of retries is reached
-                raise LinkedInException(
-                    "Max retries reached, failed to get a valid response"
-                )
+            try:
+                response = session.get(
+                    f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                    params=params,
+                    allow_redirects=True,
+                    proxies=self.proxy,
+                    headers=self.headers(),
+                    timeout=10,
+                )
+                response.raise_for_status()
+            except requests.HTTPError as e:
+                raise LinkedInException(f"bad response status code: {e.response.status_code}")
+            except ProxyError as e:
+                raise LinkedInException("bad proxy")
+            except Exception as e:
+                raise LinkedInException(str(e))
 
             soup = BeautifulSoup(response.text, "html.parser")
+            job_cards = soup.find_all("div", class_="base-search-card")
+            if len(job_cards) == 0:
+                return JobResponse(jobs=job_list)
 
-            for job_card in soup.find_all("div", class_="base-search-card"):
+            for job_card in job_cards:
                 job_url = None
                 href_tag = job_card.find("a", class_="base-card__full-link")
                 if href_tag and "href" in href_tag.attrs:

@@ -130,6 +118,7 @@ class LinkedInScraper(Scraper):
                 raise LinkedInException("Exception occurred while processing jobs")
 
             page += 25
+            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

@@ -181,22 +170,22 @@ class LinkedInScraper(Scraper):
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
 
-        description, job_type = self.get_job_description(job_url)
-        # description, job_type = None, []
+        # removed to speed up scraping
+        # description, job_type = self.get_job_description(job_url)
 
         return JobPost(
             title=title,
-            description=description,
             company_name=company,
             company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
-            job_type=job_type,
             compensation=compensation,
             benefits=benefits,
-            emails=extract_emails_from_text(description) if description else None,
-            num_urgent_words=count_urgent_words(description) if description else None,
+            # job_type=job_type,
+            # description=description,
+            # emails=extract_emails_from_text(description) if description else None,
+            # num_urgent_words=count_urgent_words(description) if description else None,
         )
 
     def get_job_description(

@@ -208,12 +197,10 @@
         :return: description or None
         """
         try:
-            response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
+            session = create_session(is_tls=False, has_retry=True)
+            response = session.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
         except requests.HTTPError as e:
-            if hasattr(e, "response") and e.response is not None:
-                if e.response.status_code in (429, 502):
-                    time.sleep(self.DELAY)
             return None, None
         except Exception as e:
             return None, None

@@ -287,3 +274,21 @@
         )
         return location
 
+    @staticmethod
+    def headers() -> dict:
+        return {
+            'authority': 'www.linkedin.com',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'accept-language': 'en-US,en;q=0.9',
+            'cache-control': 'max-age=0',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            # 'sec-ch-ua-mobile': '?0',
+            # 'sec-ch-ua-platform': '"macOS"',
+            # 'sec-fetch-dest': 'document',
+            # 'sec-fetch-mode': 'navigate',
+            # 'sec-fetch-site': 'none',
+            # 'sec-fetch-user': '?1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        }
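With the per-request retry loop gone, pacing now comes from a jittered sleep between result pages. A minimal sketch of that pattern (the loop body is a placeholder, not the real request code):

    import random
    import time

    DELAY = 3  # base delay, mirroring LinkedInScraper.DELAY

    page = 0
    while page < 75:  # placeholder bound; the scraper stops at results_wanted
        print(f"would fetch results starting at offset {page}")
        page += 25  # LinkedIn's guest search API pages in steps of 25
        # sleep a random 3-5 seconds so requests don't land on a fixed cadence
        time.sleep(random.uniform(DELAY, DELAY + 2))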

View File

@@ -1,8 +1,10 @@
 import re
 import numpy as np
-import requests
 
 import tls_client
+import requests
+from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import JobType

@@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None:
     return email_regex.findall(text)
 
-def create_session(proxy: dict | None = None, is_tls: bool = True):
+def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
     """
-    Creates a tls client session
-    :return: A session object with or without proxies.
+    Creates a requests session with optional tls, proxy, and retry settings.
+    :return: A session object
     """
     if is_tls:
         session = tls_client.Session(

@@ -39,17 +41,21 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
             random_tls_extension_order=True,
         )
         session.proxies = proxy
-        # TODO multiple proxies
-        # if self.proxies:
-        #     session.proxies = {
-        #         "http": random.choice(self.proxies),
-        #         "https": random.choice(self.proxies),
-        #     }
     else:
         session = requests.Session()
         session.allow_redirects = True
         if proxy:
            session.proxies.update(proxy)
+        if has_retry:
+            retries = Retry(total=3,
+                            connect=3,
+                            status=3,
+                            status_forcelist=[500, 502, 503, 504, 429],
+                            backoff_factor=delay)
+            adapter = HTTPAdapter(max_retries=retries)
+            session.mount('http://', adapter)
+            session.mount('https://', adapter)
     return session
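For context, urllib3's Retry transparently re-issues requests that fail to connect or return one of the listed status codes, sleeping roughly backoff_factor * 2**(retries - 1) seconds between attempts. A standalone sketch of what create_session(is_tls=False, has_retry=True) now builds:

    import requests
    from requests.adapters import HTTPAdapter, Retry

    session = requests.Session()
    retries = Retry(
        total=3,
        connect=3,
        status=3,
        status_forcelist=[500, 502, 503, 504, 429],
        backoff_factor=1,  # delays grow roughly 1s, 2s, 4s between attempts
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # transient 429/5xx responses are now retried before a request ever raises
    # response = session.get("https://example.com/api")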

View File

@@ -10,6 +10,7 @@ import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
 
+import requests
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor

@@ -26,6 +27,8 @@ class ZipRecruiterScraper(Scraper):
         """
         site = Site(Site.ZIP_RECRUITER)
         self.url = "https://www.ziprecruiter.com"
+        self.session = create_session(proxy)
+        self.get_cookies()
         super().__init__(site, proxy=proxy)
 
         self.jobs_per_page = 20

@@ -44,12 +47,10 @@ class ZipRecruiterScraper(Scraper):
         if continue_token:
             params["continue"] = continue_token
         try:
-            session = create_session(self.proxy, is_tls=False)
-            response = session.get(
+            response = self.session.get(
                 f"https://api.ziprecruiter.com/jobs-app/jobs",
                 headers=self.headers(),
                 params=self.add_params(scraper_input),
-                timeout=10,
             )
             if response.status_code != 200:
                 raise ZipRecruiterException(

@@ -156,6 +157,11 @@ class ZipRecruiterScraper(Scraper):
             num_urgent_words=count_urgent_words(description) if description else None,
         )
 
+    def get_cookies(self):
+        url = "https://api.ziprecruiter.com/jobs-app/event"
+        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        self.session.post(url, data=data, headers=ZipRecruiterScraper.headers())
+
     @staticmethod
     def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
         for job_type in JobType:
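The get_cookies call above primes the shared session before any job search: one POST to the mobile app's event endpoint sets the cookies the jobs API expects, and every later request on the same session reuses them. A hedged sketch of the pattern (payload abbreviated here; the real one is the full string in the diff):

    import requests

    session = requests.Session()

    # one-time priming request; cookies land in the session's cookie jar
    session.post(
        "https://api.ziprecruiter.com/jobs-app/event",
        data="event_type=session&logged_in=false",  # abbreviated illustrative payload
    )

    # subsequent calls on the same session carry those cookies automatically
    # response = session.get("https://api.ziprecruiter.com/jobs-app/jobs", params=...)
    print(session.cookies.get_dict())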
@@ -200,7 +206,6 @@ class ZipRecruiterScraper(Scraper):
         """
         return {
             "Host": "api.ziprecruiter.com",
-            "Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
             "accept": "*/*",
             "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
             "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",