Compare commits


7 Commits

Author            SHA1         Message                                             Date
Augusto Gunsch    33d442bf1e   Add czech to Indeed (#72)                           2023-12-02 02:42:54 -06:00
Zachary Hampton   6587e464fa   Update README.md                                    2023-11-30 11:49:31 -07:00
Vincent Yan       eed7fca300   Get full indeed description (#70)                   2023-11-27 15:00:36 -06:00
Faraz Khan        dfb8c18c51   include location with 3 parts (#69)                 2023-11-10 16:59:42 -06:00
Faraz Khan        81f70ff8a5   added salary data for linkedin (#68)                2023-11-09 14:57:15 -06:00
Cullen Watson     cc9e7866b7   fix linkedin bug & add linkedin company url (#67)   2023-11-08 15:51:07 -06:00
Zachary Hampton   a2c8fe046e   Update README.md                                    2023-11-06 22:13:19 -07:00
9 changed files with 146 additions and 118 deletions

View File

@@ -4,11 +4,8 @@
 **Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
-*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to
+*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
 work with us.*
-Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
-for real estate scraping*
 ## Features
@@ -62,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Required
-├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
 └── search_term (str)
 Optional
 ├── location (int)
@@ -107,21 +104,22 @@ The following exceptions may be raised when using JobSpy:
 * `LinkedInException`
 * `IndeedException`
 * `ZipRecruiterException`
+* `GlassdoorException`
 ## Supported Countries for Job Searching
 ### **LinkedIn**
-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using
 ### **ZipRecruiter**
 ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
-### **Indeed**
+### **Indeed / Glassdoor**
 Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
 parameter to narrow down the location, e.g. city & state if necessary.
 You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
@@ -145,6 +143,7 @@ You can specify the following countries when searching on Indeed (use the exact
 | Venezuela | Vietnam | | |
+Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
 ## Frequently Asked Questions
 ---
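
Note: a minimal usage sketch of the options documented above. Parameter names (`site_name`, `search_term`, `location`, `country_indeed`) are assumed from JobSpy's public `scrape_jobs` API; the README tree labels the site list `site_type`.

```python
# Hypothetical quick-start reflecting the README changes above:
# "glassdoor" joins the site list, and country_indeed stays required
# for Indeed/Glassdoor searches.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "glassdoor"],
    search_term="software engineer",
    location="Dallas, TX",      # narrows results within the chosen country
    country_indeed="USA",       # required for Indeed / Glassdoor
)
print(jobs.head())              # scrape_jobs returns a pandas DataFrame
```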

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.25"
+version = "1.1.30"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -163,6 +163,7 @@ def scrape_jobs(
         "site",
         "title",
         "company",
+        "company_url",
         "location",
         "job_type",
         "date_posted",

View File

@@ -55,18 +55,24 @@ class JobType(Enum):
 class Country(Enum):
-    ARGENTINA = ("argentina", "com.ar")
+    """
+    Gets the subdomain for Indeed and Glassdoor.
+    The second item in the tuple is the subdomain for Indeed
+    The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
+    """
+    ARGENTINA = ("argentina", "ar", "com.ar")
     AUSTRALIA = ("australia", "au", "com.au")
     AUSTRIA = ("austria", "at", "at")
     BAHRAIN = ("bahrain", "bh")
-    BELGIUM = ("belgium", "be", "nl:be")
+    BELGIUM = ("belgium", "be", "fr:be")
     BRAZIL = ("brazil", "br", "com.br")
     CANADA = ("canada", "ca", "ca")
     CHILE = ("chile", "cl")
     CHINA = ("china", "cn")
     COLOMBIA = ("colombia", "co")
     COSTARICA = ("costa rica", "cr")
-    CZECHREPUBLIC = ("czech republic", "cz")
+    CZECHREPUBLIC = ("czech republic,czechia", "cz")
     DENMARK = ("denmark", "dk")
     ECUADOR = ("ecuador", "ec")
     EGYPT = ("egypt", "eg")
@@ -112,8 +118,8 @@ class Country(Enum):
     TURKEY = ("turkey", "tr")
     UKRAINE = ("ukraine", "ua")
     UNITEDARABEMIRATES = ("united arab emirates", "ae")
-    UK = ("uk", "uk", "co.uk")
-    USA = ("usa", "www", "com")
+    UK = ("uk,united kingdom", "uk", "co.uk")
+    USA = ("usa,us,united states", "www", "com")
     URUGUAY = ("uruguay", "uy")
     VENEZUELA = ("venezuela", "ve")
     VIETNAM = ("vietnam", "vn")
@@ -121,7 +127,7 @@ class Country(Enum):
     # internal for ziprecruiter
     US_CANADA = ("usa/ca", "www")
-    # internal for linkeind
+    # internal for linkedin
     WORLDWIDE = ("worldwide", "www")
     @property
@@ -147,7 +153,8 @@ class Country(Enum):
         """Convert a string to the corresponding Country enum."""
         country_str = country_str.strip().lower()
         for country in cls:
-            if country.value[0] == country_str:
+            country_names = country.value[0].split(',')
+            if country_str in country_names:
                 return country
         valid_countries = [country.value for country in cls]
         raise ValueError(
@@ -167,10 +174,13 @@ class Location(BaseModel):
         if self.state:
             location_parts.append(self.state)
         if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
-            if self.country.value[0] in ("usa", "uk"):
-                location_parts.append(self.country.value[0].upper())
+            country_name = self.country.value[0]
+            if "," in country_name:
+                country_name = country_name.split(",")[0]
+            if country_name in ("usa", "uk"):
+                location_parts.append(country_name.upper())
             else:
-                location_parts.append(self.country.value[0].title())
+                location_parts.append(country_name.title())
         return ", ".join(location_parts)
@@ -181,6 +191,10 @@ class CompensationInterval(Enum):
     DAILY = "daily"
     HOURLY = "hourly"
+    @classmethod
+    def get_interval(cls, pay_period):
+        return cls[pay_period].value if pay_period in cls.__members__ else None
 class Compensation(BaseModel):
     interval: Optional[CompensationInterval] = None
@@ -196,6 +210,8 @@ class JobPost(BaseModel):
     location: Optional[Location]
     description: str | None = None
+    company_url: str | None = None
     job_type: list[JobType] | None = None
     compensation: Compensation | None = None
     date_posted: date | None = None
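
Note: a small sketch of the alias handling and the new `get_interval` helper added above (import path assumed; behavior follows the diff).

```python
from jobspy.jobs import Country, CompensationInterval  # assumed import path

# Country.from_string now splits value[0] on commas, so every alias
# resolves to the same member:
assert Country.from_string("united kingdom") is Country.UK
assert Country.from_string("US") is Country.USA            # lowercased before matching
assert Country.from_string("czechia") is Country.CZECHREPUBLIC

# get_interval looks the raw pay period up by member name and returns
# the member's value string, or None when the period is unknown:
print(CompensationInterval.get_interval("MONTHLY"))   # -> "monthly"
print(CompensationInterval.get_interval("BIWEEKLY"))  # -> None
```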

View File

@@ -4,17 +4,13 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
-import math
-import time
-import re
 import json
-from datetime import datetime, date
-from typing import Optional, Tuple, Any
-from bs4 import BeautifulSoup
+from typing import Optional, Any
+from datetime import datetime, timedelta
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
+from ..utils import create_session
 from ...jobs import (
     JobPost,
     Compensation,
@@ -22,7 +18,6 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    Country,
 )
@@ -49,9 +44,6 @@ class GlassdoorScraper(Scraper):
     ) -> (list[JobPost], str | None):
         """
         Scrapes a page of Glassdoor for jobs with scraper_input criteria
-        :param scraper_input:
-        :return: jobs found on page
-        :return: cursor for next page
         """
         try:
             payload = self.add_payload(
@@ -86,8 +78,9 @@ class GlassdoorScraper(Scraper):
             company_name = job["header"]["employerNameFromSearch"]
             location_name = job["header"].get("locationName", "")
             location_type = job["header"].get("locationType", "")
-            is_remote = False
-            location = None
+            age_in_days = job["header"].get("ageInDays")
+            is_remote, location = False, None
+            date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
             if location_type == "S":
                 is_remote = True
@@ -99,10 +92,11 @@ class GlassdoorScraper(Scraper):
             job = JobPost(
                 title=title,
                 company_name=company_name,
+                date_posted=date_posted,
                 job_url=job_url,
                 location=location,
                 compensation=compensation,
-                is_remote=is_remote,
+                is_remote=is_remote
             )
             jobs.append(job)
@@ -161,15 +155,8 @@ class GlassdoorScraper(Scraper):
         interval = None
         if pay_period == "ANNUAL":
             interval = CompensationInterval.YEARLY
-        elif pay_period == "MONTHLY":
-            interval = CompensationInterval.MONTHLY
-        elif pay_period == "WEEKLY":
-            interval = CompensationInterval.WEEKLY
-        elif pay_period == "DAILY":
-            interval = CompensationInterval.DAILY
-        elif pay_period == "HOURLY":
-            interval = CompensationInterval.HOURLY
+        elif pay_period:
+            interval = CompensationInterval.get_interval(pay_period)
         min_amount = int(adjusted_pay.get("p10") // 1)
         max_amount = int(adjusted_pay.get("p90") // 1)
@@ -180,12 +167,6 @@ class GlassdoorScraper(Scraper):
             currency=currency,
         )
-    def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return [job_type]
-        return None
     def get_location(self, location: str, is_remote: bool) -> (int, str):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
@@ -243,10 +224,17 @@ class GlassdoorScraper(Scraper):
         payload["variables"]["filterParams"].append(
             {"filterKey": "jobType", "values": filter_value}
         )
         return json.dumps([payload])
-    def parse_location(self, location_name: str) -> Location:
+    @staticmethod
+    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+        return None
+    @staticmethod
+    def parse_location(location_name: str) -> Location:
         if not location_name or location_name == "Remote":
             return None
         city, _, state = location_name.partition(", ")
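
Note: the new `date_posted` derivation above computes the posting date from Glassdoor's `ageInDays` header field. A standalone sketch (hypothetical helper name); the `if age_in_days` truthiness guard also maps `0`, i.e. posted today, to `None`.

```python
from datetime import datetime, timedelta

def date_from_age(age_in_days: int | None):
    # Mirrors the diff's expression, including the falsy-zero behavior.
    return (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None

print(date_from_age(3))  # date three days ago
print(date_from_age(0))  # None, since 0 is falsy
```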

View File

@@ -64,6 +64,7 @@ class IndeedScraper(Scraper):
             "l": scraper_input.location,
             "filter": 0,
             "start": scraper_input.offset + page * 10,
+            "sort": "date"
         }
         if scraper_input.distance:
             params["radius"] = scraper_input.distance
@@ -150,6 +151,7 @@
             title=job["normTitle"],
             description=description,
             company_name=job["company"],
+            company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
             location=Location(
                 city=job.get("jobLocationCity"),
                 state=job.get("jobLocationState"),
@@ -235,24 +237,9 @@
         if response.status_code not in range(200, 400):
             return None
-        soup = BeautifulSoup(response.text, "html.parser")
-        script_tag = soup.find(
-            "script", text=lambda x: x and "window._initialData" in x
-        )
-        if not script_tag:
-            return None
-        script_code = script_tag.string
-        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
-        if not match:
-            return None
-        json_string = match.group(1)
-        data = json.loads(json_string)
         try:
-            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
+            data = json.loads(response.text)
+            job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
                 "sanitizedJobDescription"
             ]
         except (KeyError, TypeError, IndexError):
@@ -320,7 +307,7 @@
             raise IndeedException("Could not find mosaic provider job cards data")
         else:
             raise IndeedException(
-                "Could not find a script tag containing mosaic provider data"
+                "Could not find any results for the search"
             )
     @staticmethod
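
Note: the description fetch above replaces HTML parsing of `window._initialData` with a direct `json.loads` on the response body. A hedged standalone sketch (URL construction and headers omitted; the `body -> jobInfoWrapperModel -> jobInfoModel` path is taken from the diff):

```python
import json
import requests

def get_description(job_url: str, proxy: dict | None = None) -> str | None:
    # Hypothetical free-standing version; the real scraper reuses its
    # own session, headers, and URL building.
    response = requests.get(job_url, proxies=proxy, timeout=5)
    if response.status_code not in range(200, 400):
        return None
    try:
        data = json.loads(response.text)
        return data["body"]["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return None
```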

View File

@@ -10,15 +10,15 @@ from datetime import datetime
 import requests
 import time
 from requests.exceptions import ProxyError
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from threading import Lock
+from urllib.parse import urlparse, urlunparse
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
-from ...jobs import JobPost, Location, JobResponse, JobType, Country
+from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
 class LinkedInScraper(Scraper):
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
                 if scraper_input.job_type
                 else None,
                 "pageNum": 0,
-                page: page + scraper_input.offset,
+                "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
             }
-            params = {k: v for k, v in params.items() if v is not None}
             params = {k: v for k, v in params.items() if v is not None}
             retries = 0
             while retries < self.MAX_RETRIES:
@@ -88,7 +86,7 @@ class LinkedInScraper(Scraper):
                     break
                 except requests.HTTPError as e:
                     if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code == 429:
+                        if e.response.status_code in (429, 502):
                             time.sleep(self.DELAY)
                             retries += 1
                             continue
@@ -110,43 +108,59 @@ class LinkedInScraper(Scraper):
             soup = BeautifulSoup(response.text, "html.parser")
-            with ThreadPoolExecutor(max_workers=5) as executor:
-                futures = []
-                for job_card in soup.find_all("div", class_="base-search-card"):
-                    job_url = None
-                    href_tag = job_card.find("a", class_="base-card__full-link")
-                    if href_tag and "href" in href_tag.attrs:
-                        href = href_tag.attrs["href"].split("?")[0]
-                        job_id = href.split("-")[-1]
-                        job_url = f"{self.url}/jobs/view/{job_id}"
-                    with url_lock:
-                        if job_url in seen_urls:
-                            continue
-                        seen_urls.add(job_url)
-                    futures.append(executor.submit(self.process_job, job_card, job_url))
-                for future in as_completed(futures):
-                    try:
-                        job_post = future.result()
-                        if job_post:
-                            job_list.append(job_post)
-                    except Exception as e:
-                        raise LinkedInException(
-                            "Exception occurred while processing jobs"
-                        )
+            for job_card in soup.find_all("div", class_="base-search-card"):
+                job_url = None
+                href_tag = job_card.find("a", class_="base-card__full-link")
+                if href_tag and "href" in href_tag.attrs:
+                    href = href_tag.attrs["href"].split("?")[0]
+                    job_id = href.split("-")[-1]
+                    job_url = f"{self.url}/jobs/view/{job_id}"
+                with url_lock:
+                    if job_url in seen_urls:
+                        continue
+                    seen_urls.add(job_url)
+                # Call process_job directly without threading
+                try:
+                    job_post = self.process_job(job_card, job_url)
+                    if job_post:
+                        job_list.append(job_post)
+                except Exception as e:
+                    raise LinkedInException("Exception occurred while processing jobs")
             page += 25
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
     def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+        salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+        compensation = None
+        if salary_tag:
+            salary_text = salary_tag.get_text(separator=' ').strip()
+            salary_values = [currency_parser(value) for value in salary_text.split('-')]
+            salary_min = salary_values[0]
+            salary_max = salary_values[1]
+            currency = salary_text[0] if salary_text[0] != '$' else 'USD'
+            compensation = Compensation(
+                min_amount=int(salary_min),
+                max_amount=int(salary_max),
+                currency=currency,
+            )
         title_tag = job_card.find("span", class_="sr-only")
         title = title_tag.get_text(strip=True) if title_tag else "N/A"
         company_tag = job_card.find("h4", class_="base-search-card__subtitle")
         company_a_tag = company_tag.find("a") if company_tag else None
+        company_url = (
+            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
+            if company_a_tag and company_a_tag.has_attr("href")
+            else ""
+        )
         company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
         metadata_card = job_card.find("div", class_="base-search-card__metadata")
@@ -168,15 +182,18 @@ class LinkedInScraper(Scraper):
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
         description, job_type = self.get_job_description(job_url)
+        # description, job_type = None, []
         return JobPost(
             title=title,
             description=description,
             company_name=company,
+            company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
             job_type=job_type,
+            compensation=compensation,
             benefits=benefits,
             emails=extract_emails_from_text(description) if description else None,
             num_urgent_words=count_urgent_words(description) if description else None,
@@ -193,8 +210,15 @@
         try:
             response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
+        except requests.HTTPError as e:
+            if hasattr(e, "response") and e.response is not None:
+                if e.response.status_code in (429, 502):
+                    time.sleep(self.DELAY)
+            return None, None
         except Exception as e:
            return None, None
+        if response.url == "https://www.linkedin.com/signup":
+            return None, None
         soup = BeautifulSoup(response.text, "html.parser")
         div_content = soup.find(
@@ -230,7 +254,7 @@
             employment_type = employment_type.lower()
             employment_type = employment_type.replace("-", "")
-            return [get_enum_from_job_type(employment_type)]
+            return [get_enum_from_job_type(employment_type)] if employment_type else []
         return description, get_job_type(soup)
@@ -254,5 +278,12 @@
                 state=state,
                 country=Country.from_string(self.country),
             )
+        elif len(parts) == 3:
+            city, state, country = parts
+            location = Location(
+                city=city,
+                state=state,
+                country=Country.from_string(country),
+            )
         return location
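
Note: the salary block added to `process_job` splits strings like `"$80,000.00 - $100,000.00"` on `-` and feeds each half to `currency_parser`; a quick trace under that assumption (sample text and import paths are illustrative):

```python
from jobspy.scrapers.utils import currency_parser  # assumed import path
from jobspy.jobs import Compensation               # assumed import path

salary_text = "$80,000.00 - $100,000.00"
values = [currency_parser(v) for v in salary_text.split('-')]
# values -> [80000.0, 100000.0]
compensation = Compensation(
    min_amount=int(values[0]),
    max_amount=int(values[1]),
    # A leading '$' maps to 'USD'; any other leading symbol is kept verbatim.
    currency=salary_text[0] if salary_text[0] != '$' else 'USD',
)
```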

View File

@@ -1,4 +1,5 @@
 import re
+import numpy as np
 import requests
 import tls_client
@@ -38,12 +39,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
             random_tls_extension_order=True,
         )
         session.proxies = proxy
-        # TODO multiple proxies
-        # if self.proxies:
-        #     session.proxies = {
-        #         "http": random.choice(self.proxies),
-        #         "https": random.choice(self.proxies),
-        #     }
     else:
         session = requests.Session()
         session.allow_redirects = True
@@ -62,3 +57,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
         if job_type_str in job_type.value:
             res = job_type
     return res
+def currency_parser(cur_str):
+    # Remove any non-numerical characters
+    # except for ',' '.' or '-' (e.g. EUR)
+    cur_str = re.sub("[^-0-9.,]", '', cur_str)
+    # Remove any 000s separators (either , or .)
+    cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
+    if '.' in list(cur_str[-3:]):
+        num = float(cur_str)
+    elif ',' in list(cur_str[-3:]):
+        num = float(cur_str.replace(',', '.'))
+    else:
+        num = float(cur_str)
+    return np.round(num, 2)
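
Note: `currency_parser` strips everything except digits, signs, and separators, then uses the last three characters to decide whether `,` or `.` is the decimal mark. A few worked examples (import path assumed):

```python
from jobspy.scrapers.utils import currency_parser  # assumed import path

print(currency_parser("$50,000"))     # 50000.0 (thousands separator dropped)
print(currency_parser("£45,000.50"))  # 45000.5 (decimal point kept)
print(currency_parser("€1.234,56"))   # 1234.56 (European decimal comma)
```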

View File

@@ -44,12 +44,12 @@ class ZipRecruiterScraper(Scraper):
         if continue_token:
             params["continue"] = continue_token
         try:
-            session = create_session(self.proxy, is_tls=False)
+            session = create_session(self.proxy, is_tls=True)
             response = session.get(
                 f"https://api.ziprecruiter.com/jobs-app/jobs",
                 headers=self.headers(),
                 params=self.add_params(scraper_input),
-                timeout=10,
+                timeout_seconds=10,
             )
             if response.status_code != 200:
                 raise ZipRecruiterException(
@@ -195,17 +195,12 @@ class ZipRecruiterScraper(Scraper):
     @staticmethod
     def headers() -> dict:
         """
-        Returns headers needed for requests
+        Returns headers needed for ZipRecruiter API requests
         :return: dict - Dictionary containing headers
         """
         return {
-            "Host": "api.ziprecruiter.com",
-            "Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
-            "accept": "*/*",
-            "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
-            "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
-            "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
-            "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
-            "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
-            "accept-language": "en-US,en;q=0.9",
+            'Host': 'api.ziprecruiter.com',
+            'accept': '*/*',
+            'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
+            'Cookie': '__cf_bm=DZ7eJOw6lka.Bwy5jLeDqWanaZ8BJlVAwaXrmcbYnxM-1701505132-0-AfGaVIfTA2kJlmleK14o722vbVwpZ+4UxFznsWv+guvzXSpD9KVEy/+pNzvEZUx88yaEShJwGt3/EVjhHirX/ASustKxg47V/aXRd2XIO2QN; zglobalid=61f94830-1990-4130-b222-d9d0e09c7825.57da9ea9581c.656ae86b; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; zva=100000000%3Bvid%3AZWroa0x_F1KEeGeU'
         }
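
Note: with `is_tls=True`, `create_session` returns a `tls_client` session, whose `get` takes `timeout_seconds` rather than the `timeout` keyword used by plain `requests` sessions — which is why both changes above land together. A sketch of the call as reconstructed from the diff (import path, headers, and params are illustrative):

```python
from jobspy.scrapers.utils import create_session  # assumed import path

session = create_session(proxy=None, is_tls=True)
response = session.get(
    "https://api.ziprecruiter.com/jobs-app/jobs",
    headers={"Host": "api.ziprecruiter.com", "accept": "*/*"},          # illustrative subset
    params={"search": "software engineer", "location": "Phoenix, AZ"},  # illustrative
    timeout_seconds=10,  # tls_client keyword; requests.Session uses timeout=
)
print(response.status_code)
```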