Compare commits

...

8 Commits

Author SHA1 Message Date
Augusto Gunsch
33d442bf1e Add czech to Indeed (#72) 2023-12-02 02:42:54 -06:00
Zachary Hampton
6587e464fa Update README.md 2023-11-30 11:49:31 -07:00
Vincent Yan
eed7fca300 Get full indeed description (#70) 2023-11-27 15:00:36 -06:00
Faraz Khan
dfb8c18c51 include location with 3 parts (#69) 2023-11-10 16:59:42 -06:00
Faraz Khan
81f70ff8a5 added salary data for linkedin (#68) 2023-11-09 14:57:15 -06:00
Cullen Watson
cc9e7866b7 fix linkedin bug & add linkedin company url (#67) 2023-11-08 15:51:07 -06:00
Zachary Hampton
a2c8fe046e Update README.md 2023-11-06 22:13:19 -07:00
Cullen Watson
2b7fea40a5 [fix] glassdoor duplicates 2023-10-30 20:29:55 -05:00
9 changed files with 149 additions and 118 deletions

View File

@@ -4,12 +4,9 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
for real estate scraping*
## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
@@ -62,7 +59,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext
Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── location (str)
@@ -107,18 +104,19 @@ The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
* `GlassdoorException`
## Supported Countries for Job Searching
### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter.
LinkedIn searches globally & uses only the `location` parameter. You can fetch at most 1000 jobs from the LinkedIn endpoint we're using.
### **ZipRecruiter**
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
### **Indeed**
### **Indeed / Glassdoor**
Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
parameter to narrow down the location, e.g. city & state if necessary.
@@ -145,6 +143,7 @@ You can specify the following countries when searching on Indeed (use the exact
| Venezuela | Vietnam | | |
Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
## Frequently Asked Questions
---

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.24"
version = "1.1.30"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -163,6 +163,7 @@ def scrape_jobs(
"site",
"title",
"company",
"company_url",
"location",
"job_type",
"date_posted",

View File

@@ -55,18 +55,24 @@ class JobType(Enum):
class Country(Enum):
ARGENTINA = ("argentina", "com.ar")
"""
Gets the subdomain for Indeed and Glassdoor.
The second item in the tuple is the subdomain for Indeed
The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
"""
ARGENTINA = ("argentina", "ar", "com.ar")
AUSTRALIA = ("australia", "au", "com.au")
AUSTRIA = ("austria", "at", "at")
BAHRAIN = ("bahrain", "bh")
BELGIUM = ("belgium", "be", "nl:be")
BELGIUM = ("belgium", "be", "fr:be")
BRAZIL = ("brazil", "br", "com.br")
CANADA = ("canada", "ca", "ca")
CHILE = ("chile", "cl")
CHINA = ("china", "cn")
COLOMBIA = ("colombia", "co")
COSTARICA = ("costa rica", "cr")
CZECHREPUBLIC = ("czech republic", "cz")
CZECHREPUBLIC = ("czech republic,czechia", "cz")
DENMARK = ("denmark", "dk")
ECUADOR = ("ecuador", "ec")
EGYPT = ("egypt", "eg")
@@ -112,8 +118,8 @@ class Country(Enum):
TURKEY = ("turkey", "tr")
UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk", "uk", "co.uk")
USA = ("usa", "www", "com")
UK = ("uk,united kingdom", "uk", "co.uk")
USA = ("usa,us,united states", "www", "com")
URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn")
@@ -121,7 +127,7 @@ class Country(Enum):
# internal for ziprecruiter
US_CANADA = ("usa/ca", "www")
# internal for linkeind
# internal for linkedin
WORLDWIDE = ("worldwide", "www")
@property
@@ -147,7 +153,8 @@ class Country(Enum):
"""Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower()
for country in cls:
if country.value[0] == country_str:
country_names = country.value[0].split(',')
if country_str in country_names:
return country
valid_countries = [country.value for country in cls]
raise ValueError(
@@ -167,10 +174,13 @@ class Location(BaseModel):
if self.state:
location_parts.append(self.state)
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
if self.country.value[0] in ("usa", "uk"):
location_parts.append(self.country.value[0].upper())
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
if country_name in ("usa", "uk"):
location_parts.append(country_name.upper())
else:
location_parts.append(self.country.value[0].title())
location_parts.append(country_name.title())
return ", ".join(location_parts)
@@ -181,6 +191,10 @@ class CompensationInterval(Enum):
DAILY = "daily"
HOURLY = "hourly"
# Look up an enum member by its member NAME (e.g. "HOURLY") and return that
# member's value; returns None when `pay_period` is not a member name of
# this enum. Note that the member's `.value` is returned, not the member.
@classmethod
def get_interval(cls, pay_period):
return cls[pay_period].value if pay_period in cls.__members__ else None
class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None
@@ -196,6 +210,8 @@ class JobPost(BaseModel):
location: Optional[Location]
description: str | None = None
company_url: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None

View File

@@ -4,17 +4,13 @@ jobspy.scrapers.glassdoor
This module contains routines to scrape Glassdoor.
"""
import math
import time
import re
import json
from datetime import datetime, date
from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
from typing import Optional, Any
from datetime import datetime, timedelta
from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import create_session
from ...jobs import (
JobPost,
Compensation,
@@ -22,7 +18,6 @@ from ...jobs import (
Location,
JobResponse,
JobType,
Country,
)
@@ -49,9 +44,6 @@ class GlassdoorScraper(Scraper):
) -> (list[JobPost], str | None):
"""
Scrapes a page of Glassdoor for jobs with scraper_input criteria
:param scraper_input:
:return: jobs found on page
:return: cursor for next page
"""
try:
payload = self.add_payload(
@@ -78,13 +70,17 @@ class GlassdoorScraper(Scraper):
job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
"linkItems"
][i]["url"]
if job_url in self.seen_urls:
continue
self.seen_urls.add(job_url)
job = job["jobview"]
title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
is_remote = False
location = None
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
if location_type == "S":
is_remote = True
@@ -96,10 +92,11 @@ class GlassdoorScraper(Scraper):
job = JobPost(
title=title,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
location=location,
compensation=compensation,
is_remote=is_remote,
is_remote=is_remote
)
jobs.append(job)
@@ -158,15 +155,8 @@ class GlassdoorScraper(Scraper):
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period == "MONTHLY":
interval = CompensationInterval.MONTHLY
elif pay_period == "WEEKLY":
interval = CompensationInterval.WEEKLY
elif pay_period == "DAILY":
interval = CompensationInterval.DAILY
elif pay_period == "HOURLY":
interval = CompensationInterval.HOURLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
@@ -177,12 +167,6 @@ class GlassdoorScraper(Scraper):
currency=currency,
)
def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
def get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
@@ -240,10 +224,17 @@ class GlassdoorScraper(Scraper):
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": filter_value}
)
return json.dumps([payload])
def parse_location(self, location_name: str) -> Location:
# Map a job-type string to a JobType: iterates JobType in definition order
# and returns a single-element list holding the first member whose `value`
# contains `job_type_str` (per the `in` operator); returns None when no
# member matches.
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
@staticmethod
def parse_location(location_name: str) -> Location:
if not location_name or location_name == "Remote":
return None
city, _, state = location_name.partition(", ")

View File

@@ -64,6 +64,7 @@ class IndeedScraper(Scraper):
"l": scraper_input.location,
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date"
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
@@ -150,6 +151,7 @@ class IndeedScraper(Scraper):
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
@@ -235,24 +237,9 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400):
return None
soup = BeautifulSoup(response.text, "html.parser")
script_tag = soup.find(
"script", text=lambda x: x and "window._initialData" in x
)
if not script_tag:
return None
script_code = script_tag.string
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
data = json.loads(response.text)
job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
@@ -320,7 +307,7 @@ class IndeedScraper(Scraper):
raise IndeedException("Could not find mosaic provider job cards data")
else:
raise IndeedException(
"Could not find a script tag containing mosaic provider data"
"Could not find any results for the search"
)
@staticmethod

View File

@@ -10,15 +10,15 @@ from datetime import datetime
import requests
import time
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from bs4.element import Tag
from threading import Lock
from urllib.parse import urlparse, urlunparse
from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
from ..exceptions import LinkedInException
from ...jobs import JobPost, Location, JobResponse, JobType, Country
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
class LinkedInScraper(Scraper):
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
if scraper_input.job_type
else None,
"pageNum": 0,
page: page + scraper_input.offset,
"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
}
params = {k: v for k, v in params.items() if v is not None}
params = {k: v for k, v in params.items() if v is not None}
retries = 0
while retries < self.MAX_RETRIES:
@@ -88,7 +86,7 @@ class LinkedInScraper(Scraper):
break
except requests.HTTPError as e:
if hasattr(e, "response") and e.response is not None:
if e.response.status_code == 429:
if e.response.status_code in (429, 502):
time.sleep(self.DELAY)
retries += 1
continue
@@ -110,43 +108,59 @@ class LinkedInScraper(Scraper):
soup = BeautifulSoup(response.text, "html.parser")
with ThreadPoolExecutor(max_workers=5) as executor:
futures = []
for job_card in soup.find_all("div", class_="base-search-card"):
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}"
for job_card in soup.find_all("div", class_="base-search-card"):
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}"
with url_lock:
if job_url in seen_urls:
continue
seen_urls.add(job_url)
with url_lock:
if job_url in seen_urls:
continue
seen_urls.add(job_url)
futures.append(executor.submit(self.process_job, job_card, job_url))
# Call process_job directly without threading
try:
job_post = self.process_job(job_card, job_url)
if job_post:
job_list.append(job_post)
except Exception as e:
raise LinkedInException("Exception occurred while processing jobs")
for future in as_completed(futures):
try:
job_post = future.result()
if job_post:
job_list.append(job_post)
except Exception as e:
raise LinkedInException(
"Exception occurred while processing jobs"
)
page += 25
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
compensation = None
if salary_tag:
salary_text = salary_tag.get_text(separator=' ').strip()
salary_values = [currency_parser(value) for value in salary_text.split('-')]
salary_min = salary_values[0]
salary_max = salary_values[1]
currency = salary_text[0] if salary_text[0] != '$' else 'USD'
compensation = Compensation(
min_amount=int(salary_min),
max_amount=int(salary_max),
currency=currency,
)
title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company_url = (
urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
if company_a_tag and company_a_tag.has_attr("href")
else ""
)
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata")
@@ -168,15 +182,18 @@ class LinkedInScraper(Scraper):
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
description, job_type = self.get_job_description(job_url)
# description, job_type = None, []
return JobPost(
title=title,
description=description,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
job_url=job_url,
job_type=job_type,
compensation=compensation,
benefits=benefits,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
@@ -193,8 +210,15 @@ class LinkedInScraper(Scraper):
try:
response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
response.raise_for_status()
except requests.HTTPError as e:
if hasattr(e, "response") and e.response is not None:
if e.response.status_code in (429, 502):
time.sleep(self.DELAY)
return None, None
except Exception as e:
return None, None
if response.url == "https://www.linkedin.com/signup":
return None, None
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
@@ -230,7 +254,7 @@ class LinkedInScraper(Scraper):
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)]
return [get_enum_from_job_type(employment_type)] if employment_type else []
return description, get_job_type(soup)
@@ -254,5 +278,12 @@ class LinkedInScraper(Scraper):
state=state,
country=Country.from_string(self.country),
)
elif len(parts) == 3:
city, state, country = parts
location = Location(
city=city,
state=state,
country=Country.from_string(country),
)
return location

View File

@@ -1,4 +1,5 @@
import re
import numpy as np
import requests
import tls_client
@@ -38,12 +39,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
@@ -62,3 +57,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
if job_type_str in job_type.value:
res = job_type
return res
def currency_parser(cur_str):
    """Parse a human-formatted money string into a number rounded to 2 places.

    Everything except digits, ``-``, ``.`` and ``,`` is discarded first
    (currency symbols, letters such as "EUR", whitespace). Any ``.`` or ``,``
    appearing before the last three characters is treated as a thousands
    separator and removed. Within the last three characters a ``,`` is read
    as the decimal mark unless a ``.`` is also present there.

    :param cur_str: text such as ``"$100,000"`` or ``"1.234,56 EUR"``
    :return: the parsed amount, passed through ``np.round(..., 2)``
    :raises ValueError: if what remains after cleaning is not a valid float
    """
    # Keep only the characters that can be part of the number itself.
    cleaned = re.sub("[^-0-9.,]", '', cur_str)

    # Only the final three characters may legitimately hold a decimal mark;
    # separators anywhere earlier are thousands groupings and are dropped.
    head, tail = cleaned[:-3], cleaned[-3:]
    candidate = re.sub("[.,]", '', head) + tail

    # A comma in the tail is a decimal comma, unless a dot is there as well.
    if '.' not in tail and ',' in tail:
        candidate = candidate.replace(',', '.')

    return np.round(float(candidate), 2)

View File

@@ -44,12 +44,12 @@ class ZipRecruiterScraper(Scraper):
if continue_token:
params["continue"] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input),
timeout=10,
timeout_seconds=10,
)
if response.status_code != 200:
raise ZipRecruiterException(
@@ -195,17 +195,12 @@ class ZipRecruiterScraper(Scraper):
@staticmethod
def headers() -> dict:
"""
Returns headers needed for requests
Returns headers needed for ZipRecruiter API requests
:return: dict - Dictionary containing headers
"""
return {
"Host": "api.ziprecruiter.com",
"Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
'Host': 'api.ziprecruiter.com',
'accept': '*/*',
'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
'Cookie': '__cf_bm=DZ7eJOw6lka.Bwy5jLeDqWanaZ8BJlVAwaXrmcbYnxM-1701505132-0-AfGaVIfTA2kJlmleK14o722vbVwpZ+4UxFznsWv+guvzXSpD9KVEy/+pNzvEZUx88yaEShJwGt3/EVjhHirX/ASustKxg47V/aXRd2XIO2QN; zglobalid=61f94830-1990-4130-b222-d9d0e09c7825.57da9ea9581c.656ae86b; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; zva=100000000%3Bvid%3AZWroa0x_F1KEeGeU'
}