Merge remote-tracking branch 'upstream/main'

pull/91/head
WillBlears 2023-12-04 13:23:22 -05:00
commit 6ad11e3229
7 changed files with 65 additions and 85 deletions

View File

@ -5,10 +5,7 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
for real estate scraping*
work with us.*
## Features

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.28"
version = "1.1.30"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@ -55,18 +55,24 @@ class JobType(Enum):
class Country(Enum):
ARGENTINA = ("argentina", "com.ar")
"""
Gets the subdomain for Indeed and Glassdoor.
The first item in the tuple is the country name; alternative names are separated by commas.
The second item in the tuple is the subdomain for Indeed.
The third item in the tuple is the subdomain (and TLD if there's a ':' separator) for Glassdoor.
"""
ARGENTINA = ("argentina", "ar", "com.ar")
AUSTRALIA = ("australia", "au", "com.au")
AUSTRIA = ("austria", "at", "at")
BAHRAIN = ("bahrain", "bh")
BELGIUM = ("belgium", "be", "nl:be")
BELGIUM = ("belgium", "be", "fr:be")
BRAZIL = ("brazil", "br", "com.br")
CANADA = ("canada", "ca", "ca")
CHILE = ("chile", "cl")
CHINA = ("china", "cn")
COLOMBIA = ("colombia", "co")
COSTARICA = ("costa rica", "cr")
CZECHREPUBLIC = ("czech republic", "cz")
CZECHREPUBLIC = ("czech republic,czechia", "cz")
DENMARK = ("denmark", "dk")
ECUADOR = ("ecuador", "ec")
EGYPT = ("egypt", "eg")
@ -112,8 +118,8 @@ class Country(Enum):
TURKEY = ("turkey", "tr")
UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk", "uk", "co.uk")
USA = ("usa", "www", "com")
UK = ("uk,united kingdom", "uk", "co.uk")
USA = ("usa,us,united states", "www", "com")
URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn")
@ -121,7 +127,7 @@ class Country(Enum):
# internal for ziprecruiter
US_CANADA = ("usa/ca", "www")
# internal for linkeind
# internal for linkedin
WORLDWIDE = ("worldwide", "www")
@property
@ -147,7 +153,8 @@ class Country(Enum):
"""Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower()
for country in cls:
if country.value[0] == country_str:
country_names = country.value[0].split(',')
if country_str in country_names:
return country
valid_countries = [country.value for country in cls]
raise ValueError(
@ -167,10 +174,13 @@ class Location(BaseModel):
if self.state:
location_parts.append(self.state)
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
if self.country.value[0] in ("usa", "uk"):
location_parts.append(self.country.value[0].upper())
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
if country_name in ("usa", "uk"):
location_parts.append(country_name.upper())
else:
location_parts.append(self.country.value[0].title())
location_parts.append(country_name.title())
return ", ".join(location_parts)
@ -181,6 +191,10 @@ class CompensationInterval(Enum):
DAILY = "daily"
HOURLY = "hourly"
# Look up a compensation interval by member name (e.g. "HOURLY").
# Returns the matching member's value when pay_period is the name of a
# member of this enum, and None otherwise (no exception is raised for
# unknown names because membership is checked first).
@classmethod
def get_interval(cls, pay_period):
return cls[pay_period].value if pay_period in cls.__members__ else None
class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None

View File

@ -4,17 +4,13 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor.
"""
import math
import time
import re
import json
from datetime import datetime, date
from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
from typing import Optional, Any
from datetime import datetime, timedelta
from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import create_session
from ...jobs import (
JobPost,
Compensation,
@ -22,7 +18,6 @@ from ...jobs import (
Location,
JobResponse,
JobType,
Country,
)
@ -49,9 +44,6 @@ class GlassdoorScraper(Scraper):
) -> (list[JobPost], str | None):
"""
Scrapes a page of Glassdoor for jobs with scraper_input criteria
:param scraper_input:
:return: jobs found on page
:return: cursor for next page
"""
try:
payload = self.add_payload(
@ -86,8 +78,9 @@ class GlassdoorScraper(Scraper):
company_name = job["header"]["employerNameFromSearch"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
is_remote = False
location = None
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
if location_type == "S":
is_remote = True
@ -99,10 +92,11 @@ class GlassdoorScraper(Scraper):
job = JobPost(
title=title,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
location=location,
compensation=compensation,
is_remote=is_remote,
is_remote=is_remote
)
jobs.append(job)
@ -161,15 +155,8 @@ class GlassdoorScraper(Scraper):
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period == "MONTHLY":
interval = CompensationInterval.MONTHLY
elif pay_period == "WEEKLY":
interval = CompensationInterval.WEEKLY
elif pay_period == "DAILY":
interval = CompensationInterval.DAILY
elif pay_period == "HOURLY":
interval = CompensationInterval.HOURLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
@ -180,12 +167,6 @@ class GlassdoorScraper(Scraper):
currency=currency,
)
def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
def get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
@ -243,10 +224,17 @@ class GlassdoorScraper(Scraper):
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": filter_value}
)
return json.dumps([payload])
def parse_location(self, location_name: str) -> Location:
# Find the JobType that corresponds to a job-type string.
# Returns a single-element list holding the first JobType member for which
# `job_type_str in job_type.value` is true, or None when no member matches.
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
@staticmethod
def parse_location(location_name: str) -> Location:
if not location_name or location_name == "Remote":
return None
city, _, state = location_name.partition(", ")

View File

@ -65,6 +65,7 @@ class IndeedScraper(Scraper):
"l": scraper_input.location,
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date"
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
@ -151,6 +152,7 @@ class IndeedScraper(Scraper):
title=job["displayTitle"],
description=description,
company_name=job["company"],
company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
@ -237,28 +239,18 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400):
return None
# Search for job description in the response content
job_desc_pattern = re.compile(r'"sanitizedJobDescription":"(.*?)"\s*,', re.DOTALL)
job_desc_match = job_desc_pattern.search(response.text)
# If a match is found, parse the HTML to extract the text
if job_desc_match:
# Extracting the job description HTML content
job_desc_html = job_desc_match.group(1)
# Unescape HTML entities
job_desc_html = html.unescape(job_desc_html)
# Replace escaped forward slashes and remove line breaks
job_desc_html = job_desc_html.replace('\\/', '/').replace('\\n', ' ')
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(job_desc_html, "html.parser")
# Extract text content from the HTML, with whitespace normalized
text_content = ' '.join(soup.get_text(separator=" ").split())
# Further clean up to remove any tags that might have been missed
clean_text = re.sub(r'<[^>]+>', '', text_content)
return clean_text.strip()
else:
try:
data = json.loads(response.text)
job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(job_description, "html.parser")
text_content = " ".join(soup.get_text(separator=" ").split()).strip()
return text_content
@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:
@ -317,7 +309,7 @@ class IndeedScraper(Scraper):
raise IndeedException("Could not find mosaic provider job cards data")
else:
raise IndeedException(
"Could not find a script tag containing mosaic provider data"
"Could not find any results for the search"
)
@staticmethod

View File

@ -39,12 +39,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True

View File

@ -44,12 +44,12 @@ class ZipRecruiterScraper(Scraper):
if continue_token:
params["continue"] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input),
timeout=10,
timeout_seconds=10,
)
if response.status_code != 200:
raise ZipRecruiterException(
@ -195,17 +195,12 @@ class ZipRecruiterScraper(Scraper):
@staticmethod
def headers() -> dict:
"""
Returns headers needed for requests
Returns headers needed for ZipRecruiter API requests
:return: dict - Dictionary containing headers
"""
return {
"Host": "api.ziprecruiter.com",
"Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
'Host': 'api.ziprecruiter.com',
'accept': '*/*',
'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
'Cookie': '__cf_bm=DZ7eJOw6lka.Bwy5jLeDqWanaZ8BJlVAwaXrmcbYnxM-1701505132-0-AfGaVIfTA2kJlmleK14o722vbVwpZ+4UxFznsWv+guvzXSpD9KVEy/+pNzvEZUx88yaEShJwGt3/EVjhHirX/ASustKxg47V/aXRd2XIO2QN; zglobalid=61f94830-1990-4130-b222-d9d0e09c7825.57da9ea9581c.656ae86b; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; zva=100000000%3Bvid%3AZWroa0x_F1KEeGeU'
}