mirror of https://github.com/Bunsly/JobSpy

Compare commits

No commits in common. "5bd199d0a5199a96d4a6b25057ff68c3d5eb7ae0" and "13c74a0fedd2d4000c977858b6cb446e4a0b1e3f" have entirely different histories.

5bd199d0a5 ... 13c74a0fed

In the reconstructed diffs below, "-" lines are from 5bd199d0a5 and "+" lines are from 13c74a0fed.
GitHub Actions workflow: publish to PyPI
@@ -1,33 +1,50 @@
-name: Publish JobSpy to PyPi
+name: Publish Python 🐍 distributions 📦 to PyPI
-on: push
+on:
+  pull_request:
+    types:
+      - closed
+
+permissions:
+  contents: write
+
 jobs:
   build-n-publish:
-    name: Build and publish JobSpy to PyPi
+    name: Build and publish Python 🐍 distributions 📦 to PyPI
     runs-on: ubuntu-latest
+
+    if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'
+
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
+
+      - name: Install dependencies
+        run: pip install toml
+
+      - name: Increment version
+        run: python increment_version.py
+
+      - name: Commit version increment
+        run: |
+          git config --global user.name 'github-actions'
+          git config --global user.email 'github-actions@github.com'
+          git add pyproject.toml
+          git commit -m 'Increment version'
+
+      - name: Push changes
+        run: git push
+
       - name: Install poetry
-        run: >-
-          python3 -m
-          pip install
-          poetry
-          --user
+        run: pip install poetry --user
       - name: Build distribution 📦
-        run: >-
-          python3 -m
-          poetry
-          build
+        run: poetry build
       - name: Publish distribution 📦 to PyPI
-        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
GitHub Actions workflow: Python Tests (new file in 13c74a0fed)
@@ -0,0 +1,22 @@
name: Python Tests

on:
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          pip install poetry
          poetry install
      - name: Run tests
        run: poetry run pytest tests/test_all.py
README.md (13 changed lines)
@@ -1,10 +1,10 @@
 <img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">

-**JobSpy** is a job scraping library with the goal of aggregating all the jobs from popular job boards with one tool.
+**JobSpy** is a simple, yet comprehensive, job scraping library.

 ## Features

-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
 - Aggregates the job postings in a dataframe
 - Proxies support to bypass blocking

@@ -25,7 +25,7 @@ import csv
 from jobspy import scrape_jobs

 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
     search_term="software engineer",
     google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",

@@ -58,7 +58,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-    | linkedin, zip_recruiter, indeed, glassdoor, google, bayt
+    | linkedin, zip_recruiter, indeed, glassdoor, google
     | (default is all)
 │
 ├── search_term (str)

@@ -165,11 +165,6 @@ You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
 | United Arab Emirates | UK* | USA* | Uruguay |
 | Venezuela | Vietnam* | | |

-### **Bayt**
-
-Bayt only uses the search_term parameter currently and searches internationally
-
-
 ## Notes
 * Indeed is the best scraper currently with no rate limiting.
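The README hunks above only change the `site_name` list; for reference, a minimal sketch of the 5bd199d0a5-side call described in that README. The `results_wanted` value and the `to_csv` line are illustrative assumptions, not part of the diff.

```python
# Minimal sketch of the README usage shown above ("-" side, which still lists "bayt").
import csv
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
    search_term="software engineer",
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    location="San Francisco, CA",
    results_wanted=20,  # assumed value for the sketch
)
print(len(jobs))  # jobs is a pandas DataFrame, one row per posting
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
```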
increment_version.py (new file in 13c74a0fed, run by the publish workflow above)
@@ -0,0 +1,21 @@
import toml

def increment_version(version):
    major, minor, patch = map(int, version.split('.'))
    patch += 1
    return f"{major}.{minor}.{patch}"

# Load pyproject.toml
with open('pyproject.toml', 'r') as file:
    pyproject = toml.load(file)

# Increment the version
current_version = pyproject['tool']['poetry']['version']
new_version = increment_version(current_version)
pyproject['tool']['poetry']['version'] = new_version

# Save the updated pyproject.toml
with open('pyproject.toml', 'w') as file:
    toml.dump(pyproject, file)

print(f"Version updated from {current_version} to {new_version}")
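A quick, self-contained sketch of the patch bump this script performs; "1.1.76" is the version on the 13c74a0fed side of the pyproject.toml diff further down.

```python
# Re-statement of increment_version above: only the patch component is bumped.
def increment_version(version: str) -> str:
    major, minor, patch = map(int, version.split("."))
    return f"{major}.{minor}.{patch + 1}"

assert increment_version("1.1.76") == "1.1.77"
```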
jobspy/bayt.py (only in 5bd199d0a5; removed in 13c74a0fed)
@@ -1,145 +0,0 @@
from __future__ import annotations

import random
import time

from bs4 import BeautifulSoup

from jobspy.model import (
    Scraper,
    ScraperInput,
    Site,
    JobPost,
    JobResponse,
    Location,
    Country,
)
from jobspy.util import create_logger, create_session

log = create_logger("Bayt")


class BaytScraper(Scraper):
    base_url = "https://www.bayt.com"
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
        self.scraper_input = None
        self.session = None
        self.country = "worldwide"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        self.scraper_input = scraper_input
        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        job_list: list[JobPost] = []
        page = 1
        results_wanted = (
            scraper_input.results_wanted if scraper_input.results_wanted else 10
        )

        while len(job_list) < results_wanted:
            log.info(f"Fetching Bayt jobs page {page}")
            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
            if not job_elements:
                break

            if job_elements:
                log.debug(
                    "First job element snippet:\n" + job_elements[0].prettify()[:500]
                )

            initial_count = len(job_list)
            for job in job_elements:
                try:
                    job_post = self._extract_job_info(job)
                    if job_post:
                        job_list.append(job_post)
                        if len(job_list) >= results_wanted:
                            break
                    else:
                        log.debug(
                            "Extraction returned None. Job snippet:\n"
                            + job.prettify()[:500]
                        )
                except Exception as e:
                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                    continue

            if len(job_list) == initial_count:
                log.info(f"No new jobs found on page {page}. Ending pagination.")
                break

            page += 1
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _fetch_jobs(self, query: str, page: int) -> list | None:
        """
        Grabs the job results for the given query and page number.
        """
        try:
            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
            log.debug(f"Found {len(job_listings)} job listing elements")
            return job_listings
        except Exception as e:
            log.error(f"Bayt: Error fetching jobs - {str(e)}")
            return None

    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
        """
        Extracts the job information from a single job listing.
        """
        # Find the h2 element holding the title and link (no class filtering)
        job_general_information = job.find("h2")
        if not job_general_information:
            return

        job_title = job_general_information.get_text(strip=True)
        job_url = self._extract_job_url(job_general_information)
        if not job_url:
            return

        # Extract company name using the original approach:
        company_tag = job.find("div", class_="t-nowrap p10l")
        company_name = (
            company_tag.find("span").get_text(strip=True)
            if company_tag and company_tag.find("span")
            else None
        )

        # Extract location using the original approach:
        location_tag = job.find("div", class_="t-mute t-small")
        location = location_tag.get_text(strip=True) if location_tag else None

        job_id = f"bayt-{abs(hash(job_url))}"
        location_obj = Location(
            city=location,
            country=Country.from_string(self.country),
        )
        return JobPost(
            id=job_id,
            title=job_title,
            company_name=company_name,
            location=location_obj,
            job_url=job_url,
        )

    def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
        """
        Pulls the job URL from the 'a' within the h2 element.
        """
        a_tag = job_general_information.find("a")
        if a_tag and a_tag.has_attr("href"):
            return self.base_url + a_tag["href"].strip()
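For context, a minimal sketch of driving this scraper through the generic `Scraper` interface, the same way `scrape_jobs` in `jobspy/__init__.py` does (see that diff below). The search term here is an arbitrary example value.

```python
# Sketch only: invokes BaytScraper directly via the Scraper interface.
from jobspy.bayt import BaytScraper
from jobspy.model import ScraperInput, Site

scraper = BaytScraper()
response = scraper.scrape(
    ScraperInput(site_type=[Site.BAYT], search_term="python developer", results_wanted=5)
)
for post in response.jobs:
    print(post.title, post.job_url)
```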
jobspy/glassdoor/util.py (only in 5bd199d0a5)
@@ -1,42 +0,0 @@
from jobspy.model import Compensation, CompensationInterval, Location, JobType


def parse_compensation(data: dict) -> Compensation | None:
    pay_period = data.get("payPeriod")
    adjusted_pay = data.get("payPeriodAdjustedPay")
    currency = data.get("payCurrency", "USD")
    if not pay_period or not adjusted_pay:
        return None

    interval = None
    if pay_period == "ANNUAL":
        interval = CompensationInterval.YEARLY
    elif pay_period:
        interval = CompensationInterval.get_interval(pay_period)
    min_amount = int(adjusted_pay.get("p10") // 1)
    max_amount = int(adjusted_pay.get("p90") // 1)
    return Compensation(
        interval=interval,
        min_amount=min_amount,
        max_amount=max_amount,
        currency=currency,
    )


def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]


def parse_location(location_name: str) -> Location | None:
    if not location_name or location_name == "Remote":
        return
    city, _, state = location_name.partition(", ")
    return Location(city=city, state=state)


def get_cursor_for_page(pagination_cursors, page_num):
    for cursor_data in pagination_cursors:
        if cursor_data["pageNumber"] == page_num:
            return cursor_data["cursor"]
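A small sketch of how these helpers behave on Glassdoor-style header data. The field values below are invented for illustration and only mirror the keys the functions read above.

```python
from jobspy.glassdoor.util import parse_compensation, parse_location

# Hypothetical payload shaped like the Glassdoor job "header" read above.
sample_header = {
    "payPeriod": "ANNUAL",
    "payPeriodAdjustedPay": {"p10": 90000.0, "p90": 140000.0},
    "payCurrency": "USD",
}
comp = parse_compensation(sample_header)  # yearly compensation, 90000-140000 USD
loc = parse_location("Austin, TX")        # Location(city="Austin", state="TX")
print(comp, loc)
```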
jobspy/google/util.py (only in 5bd199d0a5)
@@ -1,41 +0,0 @@
import re

from jobspy.util import create_logger

log = create_logger("Google")


def find_job_info(jobs_data: list | dict) -> list | None:
    """Iterates through the JSON data to find the job listings"""
    if isinstance(jobs_data, dict):
        for key, value in jobs_data.items():
            if key == "520084652" and isinstance(value, list):
                return value
            else:
                result = find_job_info(value)
                if result:
                    return result
    elif isinstance(jobs_data, list):
        for item in jobs_data:
            result = find_job_info(item)
            if result:
                return result
    return None


def find_job_info_initial_page(html_text: str):
    pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
    results = []
    matches = re.finditer(pattern, html_text)

    import json

    for match in matches:
        try:
            parsed_data = json.loads(match.group(1))
            results.append(parsed_data)

        except json.JSONDecodeError as e:
            log.error(f"Failed to parse match: {str(e)}")
            results.append({"raw_match": match.group(0), "error": str(e)})
    return results
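A toy sketch of the recursive lookup above. The nesting here is made up; only the magic key "520084652" comes from the module.

```python
from jobspy.google.util import find_job_info

# Invented nesting for illustration; find_job_info walks dicts and lists until
# it reaches the "520084652" key that Google's embedded JSON uses for job data.
payload = {"outer": [{"ignored": 1}, {"520084652": [["job-a"], ["job-b"]]}]}
print(find_job_info(payload))  # [['job-a'], ['job-b']]
```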
jobspy/indeed/util.py (only in 5bd199d0a5)
@@ -1,83 +0,0 @@
from jobspy.model import CompensationInterval, JobType, Compensation
from jobspy.util import get_enum_from_job_type


def get_job_type(attributes: list) -> list[JobType]:
    """
    Parses the attributes to get list of job types
    :param attributes:
    :return: list of JobType
    """
    job_types: list[JobType] = []
    for attribute in attributes:
        job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
        job_type = get_enum_from_job_type(job_type_str)
        if job_type:
            job_types.append(job_type)
    return job_types


def get_compensation(compensation: dict) -> Compensation | None:
    """
    Parses the job to get compensation
    :param compensation:
    :return: compensation object
    """
    if not compensation["baseSalary"] and not compensation["estimated"]:
        return None
    comp = (
        compensation["baseSalary"]
        if compensation["baseSalary"]
        else compensation["estimated"]["baseSalary"]
    )
    if not comp:
        return None
    interval = get_compensation_interval(comp["unitOfWork"])
    if not interval:
        return None
    min_range = comp["range"].get("min")
    max_range = comp["range"].get("max")
    return Compensation(
        interval=interval,
        min_amount=int(min_range) if min_range is not None else None,
        max_amount=int(max_range) if max_range is not None else None,
        currency=(
            compensation["estimated"]["currencyCode"]
            if compensation["estimated"]
            else compensation["currencyCode"]
        ),
    )


def is_job_remote(job: dict, description: str) -> bool:
    """
    Searches the description, location, and attributes to check if job is remote
    """
    remote_keywords = ["remote", "work from home", "wfh"]
    is_remote_in_attributes = any(
        any(keyword in attr["label"].lower() for keyword in remote_keywords)
        for attr in job["attributes"]
    )
    is_remote_in_description = any(
        keyword in description.lower() for keyword in remote_keywords
    )
    is_remote_in_location = any(
        keyword in job["location"]["formatted"]["long"].lower()
        for keyword in remote_keywords
    )
    return is_remote_in_attributes or is_remote_in_description or is_remote_in_location


def get_compensation_interval(interval: str) -> CompensationInterval:
    interval_mapping = {
        "DAY": "DAILY",
        "YEAR": "YEARLY",
        "HOUR": "HOURLY",
        "WEEK": "WEEKLY",
        "MONTH": "MONTHLY",
    }
    mapped_interval = interval_mapping.get(interval.upper(), None)
    if mapped_interval and mapped_interval in CompensationInterval.__members__:
        return CompensationInterval[mapped_interval]
    else:
        raise ValueError(f"Unsupported interval: {interval}")
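A sketch of `get_compensation` on an Indeed-style payload. The dict below is fabricated to contain only the keys the function reads above.

```python
from jobspy.indeed.util import get_compensation

# Fabricated example payload: base salary present, no estimated salary.
payload = {
    "baseSalary": {"unitOfWork": "YEAR", "range": {"min": 95000, "max": 120000}},
    "estimated": None,
    "currencyCode": "USD",
}
comp = get_compensation(payload)
print(comp.interval, comp.min_amount, comp.max_amount, comp.currency)
# CompensationInterval.YEARLY 95000 120000 USD
```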
LinkedIn helper module (only in 5bd199d0a5)
@@ -1,85 +0,0 @@
from bs4 import BeautifulSoup

from jobspy.model import JobType
from jobspy.util import get_enum_from_job_type


def job_type_code(job_type_enum: JobType) -> str:
    return {
        JobType.FULL_TIME: "F",
        JobType.PART_TIME: "P",
        JobType.INTERNSHIP: "I",
        JobType.CONTRACT: "C",
        JobType.TEMPORARY: "T",
    }.get(job_type_enum, "")


def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
    """
    Gets the job type from job page
    :param soup_job_type:
    :return: JobType
    """
    h3_tag = soup_job_type.find(
        "h3",
        class_="description__job-criteria-subheader",
        string=lambda text: "Employment type" in text,
    )
    employment_type = None
    if h3_tag:
        employment_type_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if employment_type_span:
            employment_type = employment_type_span.get_text(strip=True)
            employment_type = employment_type.lower()
            employment_type = employment_type.replace("-", "")

    return [get_enum_from_job_type(employment_type)] if employment_type else []


def parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
    """
    Gets the job level from job page
    :param soup_job_level:
    :return: str
    """
    h3_tag = soup_job_level.find(
        "h3",
        class_="description__job-criteria-subheader",
        string=lambda text: "Seniority level" in text,
    )
    job_level = None
    if h3_tag:
        job_level_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if job_level_span:
            job_level = job_level_span.get_text(strip=True)

    return job_level


def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page
    :param soup_industry:
    :return: str
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        string=lambda text: "Industries" in text,
    )
    industry = None
    if h3_tag:
        industry_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if industry_span:
            industry = industry_span.get_text(strip=True)

    return industry
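A self-contained sketch of the BeautifulSoup lookup these parsers perform, run against a hand-written fragment of a LinkedIn job page. The HTML is fabricated and contains only the h3/span pair the helpers look for.

```python
from bs4 import BeautifulSoup

# Fabricated fragment with just the criteria subheader and its value span.
html = """
<li>
  <h3 class="description__job-criteria-subheader">Seniority level</h3>
  <span class="description__job-criteria-text description__job-criteria-text--criteria">Mid-Senior level</span>
</li>
"""
soup = BeautifulSoup(html, "html.parser")
h3 = soup.find(
    "h3",
    class_="description__job-criteria-subheader",
    string=lambda text: "Seniority level" in text,
)
span = h3.find_next_sibling(
    "span",
    class_="description__job-criteria-text description__job-criteria-text--criteria",
)
print(span.get_text(strip=True))  # Mid-Senior level
```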
ZipRecruiter constants module (only in 5bd199d0a5)
@@ -1,29 +0,0 @@
headers = {
    "Host": "api.ziprecruiter.com",
    "accept": "*/*",
    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
    "accept-language": "en-US,en;q=0.9",
}

get_cookie_data = [
    ("event_type", "session"),
    ("logged_in", "false"),
    ("number_of_retry", "1"),
    ("property", "model:iPhone"),
    ("property", "os:iOS"),
    ("property", "locale:en_us"),
    ("property", "app_build_number:4734"),
    ("property", "app_version:91.0"),
    ("property", "manufacturer:Apple"),
    ("property", "timestamp:2025-01-12T12:04:42-06:00"),
    ("property", "screen_height:852"),
    ("property", "os_version:16.6.1"),
    ("property", "source:install"),
    ("property", "screen_width:393"),
    ("property", "device_model:iPhone 14 Pro"),
    ("property", "brand:Apple"),
]
ZipRecruiter helper module (only in 5bd199d0a5)
@@ -1,31 +0,0 @@
from jobspy.model import JobType


def add_params(scraper_input) -> dict[str, str | int]:
    params: dict[str, str | int] = {
        "search": scraper_input.search_term,
        "location": scraper_input.location,
    }
    if scraper_input.hours_old:
        params["days"] = max(scraper_input.hours_old // 24, 1)

    job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
    if scraper_input.job_type:
        job_type = scraper_input.job_type
        params["employment_type"] = job_type_map.get(job_type, job_type.value[0])

    if scraper_input.easy_apply:
        params["zipapply"] = 1
    if scraper_input.is_remote:
        params["remote"] = 1
    if scraper_input.distance:
        params["radius"] = scraper_input.distance

    return {k: v for k, v in params.items() if v is not None}


def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]
    return None
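A sketch of the query parameters `add_params` builds. `ScraperInput` is the model shown further down; the field values are arbitrary examples, and the import path of this helper module is an assumption (it is not named in the diff).

```python
from jobspy.model import ScraperInput, Site
# Assumed import path for the helper module listed above.
from jobspy.ziprecruiter.util import add_params

inp = ScraperInput(
    site_type=[Site.ZIP_RECRUITER],
    search_term="data engineer",
    location="Denver, CO",
    hours_old=48,
    is_remote=True,
)
print(add_params(inp))
# {'search': 'data engineer', 'location': 'Denver, CO', 'days': 2, 'remote': 1}
```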
pyproject.toml
@@ -4,14 +4,15 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.77"
+version = "1.1.76"
-description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
+description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
-authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
+authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
-homepage = "https://github.com/cullenwatson/JobSpy"
+homepage = "https://github.com/Bunsly/JobSpy"
 readme = "README.md"
-keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
+keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
 [[tool.poetry.packages]]
 include = "jobspy"
+from = "src"

 [tool.black]
 line-length = 88

@@ -28,6 +29,7 @@ markdownify = "^0.13.1"
 regex = "^2024.4.28"

 [tool.poetry.group.dev.dependencies]
+pytest = "^7.4.1"
 jupyter = "^1.0.0"
 black = "*"
 pre-commit = "*"
jobspy package __init__ (scrape_jobs)
@@ -1,27 +1,24 @@
 from __future__ import annotations

-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Tuple
-
 import pandas as pd
+from typing import Tuple
+from concurrent.futures import ThreadPoolExecutor, as_completed

-from jobspy.bayt import BaytScraper
-from jobspy.glassdoor import Glassdoor
-from jobspy.google import Google
-from jobspy.indeed import Indeed
-from jobspy.linkedin import LinkedIn
-from jobspy.model import JobType, Location, JobResponse, Country
-from jobspy.model import SalarySource, ScraperInput, Site
-from jobspy.util import (
-    set_logger_level,
-    extract_salary,
-    create_logger,
-    get_enum_from_value,
-    map_str_to_site,
-    convert_to_annual,
-    desired_order,
-)
-from jobspy.ziprecruiter import ZipRecruiter
+from .jobs import JobType, Location
+from .scrapers.utils import set_logger_level, extract_salary, create_logger
+from .scrapers.indeed import IndeedScraper
+from .scrapers.ziprecruiter import ZipRecruiterScraper
+from .scrapers.glassdoor import GlassdoorScraper
+from .scrapers.google import GoogleJobsScraper
+from .scrapers.linkedin import LinkedInScraper
+from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
+from .scrapers.exceptions import (
+    LinkedInException,
+    IndeedException,
+    ZipRecruiterException,
+    GlassdoorException,
+    GoogleJobsException,
+)


 def scrape_jobs(

@@ -35,6 +32,7 @@ def scrape_jobs(
     easy_apply: bool | None = None,
     results_wanted: int = 15,
     country_indeed: str = "usa",
+    hyperlinks: bool = False,
     proxies: list[str] | str | None = None,
     ca_cert: str | None = None,
     description_format: str = "markdown",

@@ -43,22 +41,31 @@ def scrape_jobs(
     offset: int | None = 0,
     hours_old: int = None,
     enforce_annual_salary: bool = False,
-    verbose: int = 0,
+    verbose: int = 2,
     **kwargs,
 ) -> pd.DataFrame:
     """
-    Scrapes job data from job boards concurrently
-    :return: Pandas DataFrame containing job data
+    Simultaneously scrapes job data from multiple job sites.
+    :return: pandas dataframe containing job data
     """
     SCRAPER_MAPPING = {
-        Site.LINKEDIN: LinkedIn,
-        Site.INDEED: Indeed,
-        Site.ZIP_RECRUITER: ZipRecruiter,
-        Site.GLASSDOOR: Glassdoor,
-        Site.GOOGLE: Google,
-        Site.BAYT: BaytScraper,
+        Site.LINKEDIN: LinkedInScraper,
+        Site.INDEED: IndeedScraper,
+        Site.ZIP_RECRUITER: ZipRecruiterScraper,
+        Site.GLASSDOOR: GlassdoorScraper,
+        Site.GOOGLE: GoogleJobsScraper,
     }
     set_logger_level(verbose)

+    def map_str_to_site(site_name: str) -> Site:
+        return Site[site_name.upper()]
+
+    def get_enum_from_value(value_str):
+        for job_type in JobType:
+            if value_str in job_type.value:
+                return job_type
+        raise Exception(f"Invalid job type: {value_str}")
+
     job_type = get_enum_from_value(job_type) if job_type else None

     def get_site_type():

@@ -118,12 +125,28 @@ def scrape_jobs(
             site_value, scraped_data = future.result()
             site_to_jobs_dict[site_value] = scraped_data

+    def convert_to_annual(job_data: dict):
+        if job_data["interval"] == "hourly":
+            job_data["min_amount"] *= 2080
+            job_data["max_amount"] *= 2080
+        if job_data["interval"] == "monthly":
+            job_data["min_amount"] *= 12
+            job_data["max_amount"] *= 12
+        if job_data["interval"] == "weekly":
+            job_data["min_amount"] *= 52
+            job_data["max_amount"] *= 52
+        if job_data["interval"] == "daily":
+            job_data["min_amount"] *= 260
+            job_data["max_amount"] *= 260
+        job_data["interval"] = "yearly"
+
     jobs_dfs: list[pd.DataFrame] = []

     for site, job_response in site_to_jobs_dict.items():
         for job in job_response.jobs:
             job_data = job.dict()
             job_url = job_data["job_url"]
+            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (

@@ -186,6 +209,38 @@ def scrape_jobs(
     # Step 2: Concatenate the filtered DataFrames
     jobs_df = pd.concat(filtered_dfs, ignore_index=True)

+    # Desired column order
+    desired_order = [
+        "id",
+        "site",
+        "job_url_hyper" if hyperlinks else "job_url",
+        "job_url_direct",
+        "title",
+        "company",
+        "location",
+        "date_posted",
+        "job_type",
+        "salary_source",
+        "interval",
+        "min_amount",
+        "max_amount",
+        "currency",
+        "is_remote",
+        "job_level",
+        "job_function",
+        "listing_type",
+        "emails",
+        "description",
+        "company_industry",
+        "company_url",
+        "company_logo",
+        "company_url_direct",
+        "company_addresses",
+        "company_num_employees",
+        "company_revenue",
+        "company_description",
+    ]
+
     # Step 3: Ensure all desired columns are present, adding missing ones as empty
     for column in desired_order:
         if column not in jobs_df.columns:
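The `convert_to_annual` helper added on the 13c74a0fed side normalizes salaries in place (hourly x 2080, weekly x 52, monthly x 12, daily x 260). A self-contained, compact re-statement of that arithmetic on a fabricated hourly posting:

```python
# Compact re-statement of the normalization shown in the diff above.
def convert_to_annual(job_data: dict) -> None:
    factors = {"hourly": 2080, "weekly": 52, "monthly": 12, "daily": 260}
    factor = factors.get(job_data["interval"])
    if factor:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
        job_data["interval"] = "yearly"

job = {"interval": "hourly", "min_amount": 40, "max_amount": 60}
convert_to_annual(job)
print(job)  # {'interval': 'yearly', 'min_amount': 83200, 'max_amount': 124800}
```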
Job models module (imported as jobspy.model in 5bd199d0a5, as .jobs in 13c74a0fed)
@@ -1,6 +1,5 @@
 from __future__ import annotations

-from abc import ABC, abstractmethod
 from typing import Optional
 from datetime import date
 from enum import Enum

@@ -266,49 +265,3 @@ class JobPost(BaseModel):

 class JobResponse(BaseModel):
     jobs: list[JobPost] = []
-
-
-class Site(Enum):
-    LINKEDIN = "linkedin"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
-    BAYT = "bayt"
-
-
-class SalarySource(Enum):
-    DIRECT_DATA = "direct_data"
-    DESCRIPTION = "description"
-
-
-class ScraperInput(BaseModel):
-    site_type: list[Site]
-    search_term: str | None = None
-    google_search_term: str | None = None
-
-    location: str | None = None
-    country: Country | None = Country.USA
-    distance: int | None = None
-    is_remote: bool = False
-    job_type: JobType | None = None
-    easy_apply: bool | None = None
-    offset: int = 0
-    linkedin_fetch_description: bool = False
-    linkedin_company_ids: list[int] | None = None
-    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
-
-    results_wanted: int = 15
-    hours_old: int | None = None
-
-
-class Scraper(ABC):
-    def __init__(
-        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
-    ):
-        self.site = site
-        self.proxies = proxies
-        self.ca_cert = ca_cert
-
-    @abstractmethod
-    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
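For reference, a sketch of constructing the `ScraperInput` model defined above; the field values are arbitrary examples.

```python
from jobspy.model import ScraperInput, Site, JobType  # 5bd199d0a5-side import path

inp = ScraperInput(
    site_type=[Site.INDEED, Site.LINKEDIN],
    search_term="backend developer",
    location="Chicago, IL",
    job_type=JobType.FULL_TIME,
    results_wanted=25,
)
print(inp.site_type, inp.results_wanted)
```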
scrapers package __init__ (new file in 13c74a0fed; imported above as .scrapers)
@@ -0,0 +1,57 @@
from __future__ import annotations

from abc import ABC, abstractmethod

from ..jobs import (
    Enum,
    BaseModel,
    JobType,
    JobResponse,
    Country,
    DescriptionFormat,
)


class Site(Enum):
    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"


class SalarySource(Enum):
    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"


class ScraperInput(BaseModel):
    site_type: list[Site]
    search_term: str | None = None
    google_search_term: str | None = None

    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    offset: int = 0
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    results_wanted: int = 15
    hours_old: int | None = None


class Scraper(ABC):
    def __init__(
        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
    ):
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
Scraper exceptions module
@@ -1,5 +1,5 @@
 """
-jobspy.jobboard.exceptions
+jobspy.scrapers.exceptions
 ~~~~~~~~~~~~~~~~~~~

 This module contains the set of Scrapers' exceptions.

@@ -29,8 +29,3 @@ class GlassdoorException(Exception):
 class GoogleJobsException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Google Jobs")
-
-
-class BaytException(Exception):
-    def __init__(self, message=None):
-        super().__init__(message or "An error occurred with Bayt")
Glassdoor scraper module
@@ -1,38 +1,41 @@
+"""
+jobspy.scrapers.glassdoor
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Glassdoor.
+"""
+
 from __future__ import annotations

 import re
 import json
 import requests
-from typing import Tuple
+from typing import Optional, Tuple
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor, as_completed

-from jobspy.glassdoor.constant import fallback_token, query_template, headers
-from jobspy.glassdoor.util import (
-    get_cursor_for_page,
-    parse_compensation,
-    parse_location,
-)
-from jobspy.util import (
-    extract_emails_from_text,
-    create_logger,
-    create_session,
-    markdown_converter,
-)
-from jobspy.exception import GlassdoorException
-from jobspy.model import (
-    JobPost,
-    JobResponse,
-    DescriptionFormat,
-    Scraper,
-    ScraperInput,
-    Site,
-)
+from .constants import fallback_token, query_template, headers
+from .. import Scraper, ScraperInput, Site
+from ..utils import extract_emails_from_text, create_logger
+from ..exceptions import GlassdoorException
+from ..utils import (
+    create_session,
+    markdown_converter,
+)
+from ...jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+    DescriptionFormat,
+)

-log = create_logger("Glassdoor")
+logger = create_logger("Glassdoor")


-class Glassdoor(Scraper):
+class GlassdoorScraper(Scraper):
     def __init__(
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):

@@ -61,7 +64,7 @@ class Glassdoor(Scraper):
         self.base_url = self.scraper_input.country.get_glassdoor_url()

         self.session = create_session(
-            proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
+            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
         )
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token

@@ -71,7 +74,7 @@ class Glassdoor(Scraper):
             scraper_input.location, scraper_input.is_remote
         )
         if location_type is None:
-            log.error("Glassdoor: location not parsed")
+            logger.error("Glassdoor: location not parsed")
             return JobResponse(jobs=[])
         job_list: list[JobPost] = []
         cursor = None

@@ -80,7 +83,7 @@ class Glassdoor(Scraper):
         tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
         range_end = min(tot_pages, self.max_pages + 1)
         for page in range(range_start, range_end):
-            log.info(f"search page: {page} / {range_end - 1}")
+            logger.info(f"search page: {page} / {range_end-1}")
             try:
                 jobs, cursor = self._fetch_jobs_page(
                     scraper_input, location_id, location_type, page, cursor

@@ -90,7 +93,7 @@ class Glassdoor(Scraper):
                     job_list = job_list[: scraper_input.results_wanted]
                     break
             except Exception as e:
-                log.error(f"Glassdoor: {str(e)}")
+                logger.error(f"Glassdoor: {str(e)}")
                 break
         return JobResponse(jobs=job_list)

@@ -126,7 +129,7 @@ class Glassdoor(Scraper):
             ValueError,
             Exception,
         ) as e:
-            log.error(f"Glassdoor: {str(e)}")
+            logger.error(f"Glassdoor: {str(e)}")
             return jobs, None

         jobs_data = res_json["data"]["jobListings"]["jobListings"]

@@ -143,7 +146,7 @@ class Glassdoor(Scraper):
             except Exception as exc:
                 raise GlassdoorException(f"Glassdoor generated an exception: {exc}")

-        return jobs, get_cursor_for_page(
+        return jobs, self.get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )

@@ -182,9 +185,9 @@ class Glassdoor(Scraper):
         if location_type == "S":
             is_remote = True
         else:
-            location = parse_location(location_name)
+            location = self.parse_location(location_name)

-        compensation = parse_compensation(job["header"])
+        compensation = self.parse_compensation(job["header"])
         try:
             description = self._fetch_job_description(job_id)
         except:

@@ -261,12 +264,12 @@ class Glassdoor(Scraper):
         if res.status_code != 200:
             if res.status_code == 429:
                 err = f"429 Response - Blocked by Glassdoor for too many requests"
-                log.error(err)
+                logger.error(err)
                 return None, None
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-                log.error(f"Glassdoor response status code {res.status_code}")
+                logger.error(f"Glassdoor response status code {res.status_code}")
                 return None, None
         items = res.json()

@@ -318,3 +321,44 @@ class Glassdoor(Scraper):
                 {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
             )
         return json.dumps([payload])
+
+    @staticmethod
+    def parse_compensation(data: dict) -> Optional[Compensation]:
+        pay_period = data.get("payPeriod")
+        adjusted_pay = data.get("payPeriodAdjustedPay")
+        currency = data.get("payCurrency", "USD")
+        if not pay_period or not adjusted_pay:
+            return None
+
+        interval = None
+        if pay_period == "ANNUAL":
+            interval = CompensationInterval.YEARLY
+        elif pay_period:
+            interval = CompensationInterval.get_interval(pay_period)
+        min_amount = int(adjusted_pay.get("p10") // 1)
+        max_amount = int(adjusted_pay.get("p90") // 1)
+        return Compensation(
+            interval=interval,
+            min_amount=min_amount,
+            max_amount=max_amount,
+            currency=currency,
+        )
+
+    @staticmethod
+    def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+
+    @staticmethod
+    def parse_location(location_name: str) -> Location | None:
+        if not location_name or location_name == "Remote":
+            return
+        city, _, state = location_name.partition(", ")
+        return Location(city=city, state=state)
+
+    @staticmethod
+    def get_cursor_for_page(pagination_cursors, page_num):
+        for cursor_data in pagination_cursors:
+            if cursor_data["pageNumber"] == page_num:
+                return cursor_data["cursor"]
Google scraper module
@@ -1,3 +1,10 @@
+"""
+jobspy.scrapers.google
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Google.
+"""
+
 from __future__ import annotations

 import math

@@ -6,21 +13,23 @@ import json
 from typing import Tuple
 from datetime import datetime, timedelta

-from jobspy.google.constant import headers_jobs, headers_initial, async_param
-from jobspy.model import (
-    Scraper,
-    ScraperInput,
-    Site,
-    JobPost,
-    JobResponse,
-    Location,
-    JobType,
-)
-from jobspy.util import extract_emails_from_text, extract_job_type, create_session
-from jobspy.google.util import log, find_job_info_initial_page, find_job_info
+from .constants import headers_jobs, headers_initial, async_param
+from .. import Scraper, ScraperInput, Site
+from ..utils import extract_emails_from_text, create_logger, extract_job_type
+from ..utils import (
+    create_session,
+)
+from ...jobs import (
+    JobPost,
+    JobResponse,
+    Location,
+    JobType,
+)
+
+logger = create_logger("Google")


-class Google(Scraper):
+class GoogleJobsScraper(Scraper):
     def __init__(
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):

@@ -52,7 +61,7 @@ class Google(Scraper):
         )
         forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            log.warning(
+            logger.warning(
                 "initial cursor not found, try changing your query or there was at most 10 results"
             )
             return JobResponse(jobs=job_list)

@@ -63,16 +72,16 @@ class Google(Scraper):
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
             and forward_cursor
         ):
-            log.info(
+            logger.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             try:
                 jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
             except Exception as e:
-                log.error(f"failed to get jobs on page: {page}, {e}")
+                logger.error(f"failed to get jobs on page: {page}, {e}")
                 break
             if not jobs:
-                log.info(f"found no jobs on page: {page}")
+                logger.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1

@@ -126,7 +135,7 @@ class Google(Scraper):
         pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        jobs_raw = find_job_info_initial_page(response.text)
+        jobs_raw = self._find_job_info_initial_page(response.text)
         jobs = []
         for job_raw in jobs_raw:
             job_post = self._parse_job(job_raw)

@@ -158,7 +167,7 @@ class Google(Scraper):
                 continue
             job_d = json.loads(job_data)

-            job_info = find_job_info(job_d)
+            job_info = self._find_job_info(job_d)
             job_post = self._parse_job(job_info)
             if job_post:
                 jobs_on_page.append(job_post)

@@ -200,3 +209,42 @@ class Google(Scraper):
             job_type=extract_job_type(description),
         )
         return job_post
+
+    @staticmethod
+    def _find_job_info(jobs_data: list | dict) -> list | None:
+        """Iterates through the JSON data to find the job listings"""
+        if isinstance(jobs_data, dict):
+            for key, value in jobs_data.items():
+                if key == "520084652" and isinstance(value, list):
+                    return value
+                else:
+                    result = GoogleJobsScraper._find_job_info(value)
+                    if result:
+                        return result
+        elif isinstance(jobs_data, list):
+            for item in jobs_data:
+                result = GoogleJobsScraper._find_job_info(item)
+                if result:
+                    return result
+        return None
+
+    @staticmethod
+    def _find_job_info_initial_page(html_text: str):
+        pattern = (
+            f'520084652":('
+            + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
+        )
+        results = []
+        matches = re.finditer(pattern, html_text)
+
+        import json
+
+        for match in matches:
+            try:
+                parsed_data = json.loads(match.group(1))
+                results.append(parsed_data)
+
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to parse match: {str(e)}")
+                results.append({"raw_match": match.group(0), "error": str(e)})
+        return results
Indeed scraper module
@@ -1,32 +1,39 @@
+"""
+jobspy.scrapers.indeed
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Indeed.
+"""
+
 from __future__ import annotations

 import math
-from datetime import datetime
 from typing import Tuple
+from datetime import datetime

-from jobspy.indeed.constant import job_search_query, api_headers
-from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
-from jobspy.model import (
-    Scraper,
-    ScraperInput,
-    Site,
-    JobPost,
-    Location,
-    JobResponse,
-    JobType,
-    DescriptionFormat,
-)
-from jobspy.util import (
-    extract_emails_from_text,
-    markdown_converter,
-    create_session,
-    create_logger,
-)
+from .constants import job_search_query, api_headers
+from .. import Scraper, ScraperInput, Site
+from ..utils import (
+    extract_emails_from_text,
+    get_enum_from_job_type,
+    markdown_converter,
+    create_session,
+    create_logger,
+)
+from ...jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+    DescriptionFormat,
+)

-log = create_logger("Indeed")
+logger = create_logger("Indeed")


-class Indeed(Scraper):
+class IndeedScraper(Scraper):
     def __init__(
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):

@@ -64,12 +71,12 @@ class Indeed(Scraper):
         cursor = None

         while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
-            log.info(
+            logger.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
-                log.info(f"found no jobs on page: {page}")
+                logger.info(f"found no jobs on page: {page}")
                 break
             job_list += jobs
             page += 1

@@ -115,10 +122,9 @@ class Indeed(Scraper):
             headers=api_headers_temp,
             json=payload,
             timeout=10,
-            verify=False,
         )
         if not response.ok:
-            log.info(
+            logger.info(
                 f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
             )
             return jobs, new_cursor

@@ -206,7 +212,7 @@ class Indeed(Scraper):
         if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
             description = markdown_converter(description)

-        job_type = get_job_type(job["attributes"])
+        job_type = self._get_job_type(job["attributes"])
         timestamp_seconds = job["datePublished"] / 1000
         date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
         employer = job["employer"].get("dossier") if job["employer"] else None

@@ -227,14 +233,14 @@ class Indeed(Scraper):
                 country=job.get("location", {}).get("countryCode"),
             ),
             job_type=job_type,
-            compensation=get_compensation(job["compensation"]),
+            compensation=self._get_compensation(job["compensation"]),
             date_posted=date_posted,
             job_url=job_url,
             job_url_direct=(
                 job["recruit"].get("viewJobUrl") if job.get("recruit") else None
             ),
             emails=extract_emails_from_text(description) if description else None,
-            is_remote=is_job_remote(job, description),
+            is_remote=self._is_job_remote(job, description),
             company_addresses=(
                 employer_details["addresses"][0]
                 if employer_details.get("addresses")

@@ -258,3 +264,86 @@ class Indeed(Scraper):
                 else None
             ),
         )
+
+    @staticmethod
+    def _get_job_type(attributes: list) -> list[JobType]:
+        """
+        Parses the attributes to get list of job types
+        :param attributes:
+        :return: list of JobType
+        """
+        job_types: list[JobType] = []
+        for attribute in attributes:
+            job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
+            job_type = get_enum_from_job_type(job_type_str)
+            if job_type:
+                job_types.append(job_type)
+        return job_types
+
+    @staticmethod
+    def _get_compensation(compensation: dict) -> Compensation | None:
+        """
+        Parses the job to get compensation
+        :param job:
+        :return: compensation object
+        """
+        if not compensation["baseSalary"] and not compensation["estimated"]:
+            return None
+        comp = (
+            compensation["baseSalary"]
+            if compensation["baseSalary"]
+            else compensation["estimated"]["baseSalary"]
+        )
+        if not comp:
+            return None
+        interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
+        if not interval:
+            return None
+        min_range = comp["range"].get("min")
+        max_range = comp["range"].get("max")
+        return Compensation(
+            interval=interval,
+            min_amount=int(min_range) if min_range is not None else None,
+            max_amount=int(max_range) if max_range is not None else None,
+            currency=(
+                compensation["estimated"]["currencyCode"]
+                if compensation["estimated"]
+                else compensation["currencyCode"]
+            ),
+        )
+
+    @staticmethod
+    def _is_job_remote(job: dict, description: str) -> bool:
+        """
+        Searches the description, location, and attributes to check if job is remote
+        """
+        remote_keywords = ["remote", "work from home", "wfh"]
+        is_remote_in_attributes = any(
+            any(keyword in attr["label"].lower() for keyword in remote_keywords)
+            for attr in job["attributes"]
+        )
+        is_remote_in_description = any(
+            keyword in description.lower() for keyword in remote_keywords
+        )
+        is_remote_in_location = any(
+            keyword in job["location"]["formatted"]["long"].lower()
+            for keyword in remote_keywords
+        )
+        return (
+            is_remote_in_attributes or is_remote_in_description or is_remote_in_location
+        )
|
@staticmethod
|
||||||
|
def _get_compensation_interval(interval: str) -> CompensationInterval:
|
||||||
|
interval_mapping = {
|
||||||
|
"DAY": "DAILY",
|
||||||
|
"YEAR": "YEARLY",
|
||||||
|
"HOUR": "HOURLY",
|
||||||
|
"WEEK": "WEEKLY",
|
||||||
|
"MONTH": "MONTHLY",
|
||||||
|
}
|
||||||
|
mapped_interval = interval_mapping.get(interval.upper(), None)
|
||||||
|
if mapped_interval and mapped_interval in CompensationInterval.__members__:
|
||||||
|
return CompensationInterval[mapped_interval]
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported interval: {interval}")
|
|
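The `_get_compensation_interval` helper in the hunk above normalizes Indeed's `unitOfWork` values ("HOUR", "YEAR", ...) before indexing into the `CompensationInterval` enum. A minimal standalone sketch of that lookup, with an inlined stand-in enum and illustrative sample values (not code from either revision):

# Standalone sketch of the unitOfWork -> CompensationInterval lookup shown above.
# The enum here is a stand-in for the project's CompensationInterval; values are illustrative.
from enum import Enum


class CompensationInterval(Enum):
    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"


def get_compensation_interval(interval: str) -> CompensationInterval:
    interval_mapping = {
        "DAY": "DAILY",
        "YEAR": "YEARLY",
        "HOUR": "HOURLY",
        "WEEK": "WEEKLY",
        "MONTH": "MONTHLY",
    }
    mapped = interval_mapping.get(interval.upper())
    if mapped and mapped in CompensationInterval.__members__:
        return CompensationInterval[mapped]
    raise ValueError(f"Unsupported interval: {interval}")


print(get_compensation_interval("HOUR"))  # CompensationInterval.HOURLY
print(get_compensation_interval("year"))  # CompensationInterval.YEARLY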
@@ -1,48 +1,47 @@
+"""
+jobspy.scrapers.linkedin
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape LinkedIn.
+"""
+
 from __future__ import annotations

 import math
-import random
 import time
-from datetime import datetime
+import random
+import regex as re
 from typing import Optional
+from datetime import datetime

+from bs4.element import Tag
+from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote

-import regex as re
+from .constants import headers
-from bs4 import BeautifulSoup
+from .. import Scraper, ScraperInput, Site
-from bs4.element import Tag
+from ..exceptions import LinkedInException
+from ..utils import create_session, remove_attributes, create_logger
-from jobspy.exception import LinkedInException
+from ...jobs import (
-from jobspy.linkedin.constant import headers
-from jobspy.linkedin.util import (
-    job_type_code,
-    parse_job_type,
-    parse_job_level,
-    parse_company_industry,
-)
-from jobspy.model import (
     JobPost,
     Location,
     JobResponse,
+    JobType,
     Country,
     Compensation,
     DescriptionFormat,
-    Scraper,
-    ScraperInput,
-    Site,
 )
-from jobspy.util import (
+from ..utils import (
     extract_emails_from_text,
+    get_enum_from_job_type,
     currency_parser,
     markdown_converter,
-    create_session,
-    remove_attributes,
-    create_logger,
 )

-log = create_logger("LinkedIn")
+logger = create_logger("LinkedIn")


-class LinkedIn(Scraper):
+class LinkedInScraper(Scraper):
     base_url = "https://www.linkedin.com"
     delay = 3
     band_delay = 4

@@ -87,7 +86,7 @@ class LinkedIn(Scraper):
         )
         while continue_search():
             request_count += 1
-            log.info(
+            logger.info(
                 f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
             )
             params = {

@@ -96,7 +95,7 @@ class LinkedIn(Scraper):
                 "distance": scraper_input.distance,
                 "f_WT": 2 if scraper_input.is_remote else None,
                 "f_JT": (
-                    job_type_code(scraper_input.job_type)
+                    self.job_type_code(scraper_input.job_type)
                     if scraper_input.job_type
                     else None
                 ),

@@ -127,13 +126,13 @@ class LinkedIn(Scraper):
             else:
                 err = f"LinkedIn response status code {response.status_code}"
                 err += f" - {response.text}"
-                log.error(err)
+                logger.error(err)
                 return JobResponse(jobs=job_list)
         except Exception as e:
             if "Proxy responded with" in str(e):
-                log.error(f"LinkedIn: Bad proxy")
+                logger.error(f"LinkedIn: Bad proxy")
             else:
-                log.error(f"LinkedIn: {str(e)}")
+                logger.error(f"LinkedIn: {str(e)}")
             return JobResponse(jobs=job_list)

         soup = BeautifulSoup(response.text, "html.parser")

@@ -283,9 +282,9 @@ class LinkedIn(Scraper):
         )
         return {
             "description": description,
-            "job_level": parse_job_level(soup),
+            "job_level": self._parse_job_level(soup),
-            "company_industry": parse_company_industry(soup),
+            "company_industry": self._parse_company_industry(soup),
-            "job_type": parse_job_type(soup),
+            "job_type": self._parse_job_type(soup),
             "job_url_direct": self._parse_job_url_direct(soup),
             "company_logo": company_logo,
             "job_function": job_function,

@@ -317,6 +316,77 @@ class LinkedIn(Scraper):
         location = Location(city=city, state=state, country=country)
         return location

+    @staticmethod
+    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
+        """
+        Gets the job type from job page
+        :param soup_job_type:
+        :return: JobType
+        """
+        h3_tag = soup_job_type.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            string=lambda text: "Employment type" in text,
+        )
+        employment_type = None
+        if h3_tag:
+            employment_type_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if employment_type_span:
+                employment_type = employment_type_span.get_text(strip=True)
+                employment_type = employment_type.lower()
+                employment_type = employment_type.replace("-", "")
+
+        return [get_enum_from_job_type(employment_type)] if employment_type else []
+
+    @staticmethod
+    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
+        """
+        Gets the job level from job page
+        :param soup_job_level:
+        :return: str
+        """
+        h3_tag = soup_job_level.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            string=lambda text: "Seniority level" in text,
+        )
+        job_level = None
+        if h3_tag:
+            job_level_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if job_level_span:
+                job_level = job_level_span.get_text(strip=True)
+
+        return job_level
+
+    @staticmethod
+    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
+        """
+        Gets the company industry from job page
+        :param soup_industry:
+        :return: str
+        """
+        h3_tag = soup_industry.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            string=lambda text: "Industries" in text,
+        )
+        industry = None
+        if h3_tag:
+            industry_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if industry_span:
+                industry = industry_span.get_text(strip=True)
+
+        return industry
+
     def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
         """
         Gets the job url direct from job page

@@ -333,3 +403,13 @@ class LinkedIn(Scraper):
             job_url_direct = unquote(job_url_direct_match.group())

         return job_url_direct
+
+    @staticmethod
+    def job_type_code(job_type_enum: JobType) -> str:
+        return {
+            JobType.FULL_TIME: "F",
+            JobType.PART_TIME: "P",
+            JobType.INTERNSHIP: "I",
+            JobType.CONTRACT: "C",
+            JobType.TEMPORARY: "T",
+        }.get(job_type_enum, "")
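The `job_type_code` mapping above is what fills LinkedIn's `f_JT` query parameter. A standalone sketch of assembling that filter string; the stand-in enum and the `urlencode` usage are illustrative assumptions, not taken from either revision:

# Illustrative sketch: map a job-type enum to LinkedIn's single-letter f_JT code.
from enum import Enum
from urllib.parse import urlencode


class JobType(Enum):  # stand-in for the project's JobType enum
    FULL_TIME = "fulltime"
    PART_TIME = "parttime"
    INTERNSHIP = "internship"
    CONTRACT = "contract"
    TEMPORARY = "temporary"


def job_type_code(job_type_enum: JobType) -> str:
    return {
        JobType.FULL_TIME: "F",
        JobType.PART_TIME: "P",
        JobType.INTERNSHIP: "I",
        JobType.CONTRACT: "C",
        JobType.TEMPORARY: "T",
    }.get(job_type_enum, "")


params = {"keywords": "engineer", "f_JT": job_type_code(JobType.FULL_TIME)}
print(urlencode(params))  # keywords=engineer&f_JT=F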
@@ -1,19 +1,16 @@
 from __future__ import annotations

-import logging
 import re
+import logging
 from itertools import cycle

-import numpy as np
 import requests
 import tls_client
-import urllib3
+import numpy as np
 from markdownify import markdownify as md
 from requests.adapters import HTTPAdapter, Retry

-from jobspy.model import CompensationInterval, JobType, Site
+from ..jobs import CompensationInterval, JobType

-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-


 def create_logger(name: str):

@@ -132,7 +129,7 @@ def create_session(
     return session


-def set_logger_level(verbose: int):
+def set_logger_level(verbose: int = 2):
    """
    Adjusts the logger's level. This function allows the logging level to be changed at runtime.


@@ -286,62 +283,3 @@ def extract_job_type(description: str):
            listing_types.append(key)

    return listing_types if listing_types else None
-
-
-def map_str_to_site(site_name: str) -> Site:
-    return Site[site_name.upper()]
-
-
-def get_enum_from_value(value_str):
-    for job_type in JobType:
-        if value_str in job_type.value:
-            return job_type
-    raise Exception(f"Invalid job type: {value_str}")
-
-
-def convert_to_annual(job_data: dict):
-    if job_data["interval"] == "hourly":
-        job_data["min_amount"] *= 2080
-        job_data["max_amount"] *= 2080
-    if job_data["interval"] == "monthly":
-        job_data["min_amount"] *= 12
-        job_data["max_amount"] *= 12
-    if job_data["interval"] == "weekly":
-        job_data["min_amount"] *= 52
-        job_data["max_amount"] *= 52
-    if job_data["interval"] == "daily":
-        job_data["min_amount"] *= 260
-        job_data["max_amount"] *= 260
-    job_data["interval"] = "yearly"
-
-
-desired_order = [
-    "id",
-    "site",
-    "job_url",
-    "job_url_direct",
-    "title",
-    "company",
-    "location",
-    "date_posted",
-    "job_type",
-    "salary_source",
-    "interval",
-    "min_amount",
-    "max_amount",
-    "currency",
-    "is_remote",
-    "job_level",
-    "job_function",
-    "listing_type",
-    "emails",
-    "description",
-    "company_industry",
-    "company_url",
-    "company_logo",
-    "company_url_direct",
-    "company_addresses",
-    "company_num_employees",
-    "company_revenue",
-    "company_description",
-]
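The `convert_to_annual` helper above (present only on the 5bd199d0a5 side of the diff) scales a salary range to a yearly figure: hourly x 2080, daily x 260, weekly x 52, monthly x 12. A condensed standalone sketch with illustrative numbers; the lookup-table form is a rewrite for brevity, not the original structure:

# Standalone sketch of the annualization rules shown above; sample figures are illustrative.
def convert_to_annual(job_data: dict) -> dict:
    factors = {"hourly": 2080, "daily": 260, "weekly": 52, "monthly": 12}
    factor = factors.get(job_data["interval"])
    if factor:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
        job_data["interval"] = "yearly"
    return job_data


print(convert_to_annual({"interval": "hourly", "min_amount": 50, "max_amount": 60}))
# {'interval': 'yearly', 'min_amount': 104000, 'max_amount': 124800}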
@@ -1,39 +1,46 @@
+"""
+jobspy.scrapers.ziprecruiter
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape ZipRecruiter.
+"""
+
 from __future__ import annotations

 import json
 import math
 import re
 import time
-from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
+from typing import Optional, Tuple, Any
+
+from concurrent.futures import ThreadPoolExecutor

 from bs4 import BeautifulSoup

-from jobspy.ziprecruiter.constant import headers, get_cookie_data
+from .constants import headers
-from jobspy.util import (
+from .. import Scraper, ScraperInput, Site
+from ..utils import (
     extract_emails_from_text,
     create_session,
     markdown_converter,
     remove_attributes,
     create_logger,
 )
-from jobspy.model import (
+from ...jobs import (
     JobPost,
     Compensation,
     Location,
     JobResponse,
+    JobType,
     Country,
     DescriptionFormat,
-    Scraper,
-    ScraperInput,
-    Site,
 )
-from jobspy.ziprecruiter.util import get_job_type_enum, add_params

-log = create_logger("ZipRecruiter")
+logger = create_logger("ZipRecruiter")


-class ZipRecruiter(Scraper):
+class ZipRecruiterScraper(Scraper):
     base_url = "https://www.ziprecruiter.com"
     api_url = "https://api.ziprecruiter.com"


@@ -70,7 +77,7 @@ class ZipRecruiter(Scraper):
                 break
             if page > 1:
                 time.sleep(self.delay)
-            log.info(f"search page: {page} / {max_pages}")
+            logger.info(f"search page: {page} / {max_pages}")
             jobs_on_page, continue_token = self._find_jobs_in_page(
                 scraper_input, continue_token
             )

@@ -84,7 +91,7 @@ class ZipRecruiter(Scraper):

     def _find_jobs_in_page(
         self, scraper_input: ScraperInput, continue_token: str | None = None
-    ) -> tuple[list[JobPost], str | None]:
+    ) -> Tuple[list[JobPost], Optional[str]]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:

@@ -92,7 +99,7 @@ class ZipRecruiter(Scraper):
         :return: jobs found on page
         """
         jobs_list = []
-        params = add_params(scraper_input)
+        params = self._add_params(scraper_input)
         if continue_token:
             params["continue_from"] = continue_token
         try:

@@ -103,13 +110,13 @@ class ZipRecruiter(Scraper):
             else:
                 err = f"ZipRecruiter response status code {res.status_code}"
                 err += f" with response: {res.text}"  # ZipRecruiter likely not available in EU
-                log.error(err)
+                logger.error(err)
                 return jobs_list, ""
         except Exception as e:
             if "Proxy responded with" in str(e):
-                log.error(f"Indeed: Bad proxy")
+                logger.error(f"Indeed: Bad proxy")
             else:
-                log.error(f"Indeed: {str(e)}")
+                logger.error(f"Indeed: {str(e)}")
             return jobs_list, ""

         res_data = res.json()

@@ -145,7 +152,7 @@ class ZipRecruiter(Scraper):
         location = Location(
             city=job.get("job_city"), state=job.get("job_state"), country=country_enum
         )
-        job_type = get_job_type_enum(
+        job_type = self._get_job_type_enum(
             job.get("employment_type", "").replace("_", "").lower()
         )
         date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()

@@ -194,8 +201,6 @@ class ZipRecruiter(Scraper):
             else ""
         )
         description_full = job_description_clean + company_description_clean
-
-        try:
         script_tag = soup.find("script", type="application/json")
         if script_tag:
             job_json = json.loads(script_tag.string)

@@ -203,8 +208,6 @@ class ZipRecruiter(Scraper):
             m = re.search(r"job_url=(.+)", job_url_val)
             if m:
                 job_url_direct = m.group(1)
-        except:
-            job_url_direct = None

         if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
             description_full = markdown_converter(description_full)

@@ -212,8 +215,33 @@ class ZipRecruiter(Scraper):
         return description_full, job_url_direct

     def _get_cookies(self):
-        """
+        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
-        Sends a session event to the API with device properties.
-        """
         url = f"{self.api_url}/jobs-app/event"
-        self.session.post(url, data=get_cookie_data)
+        self.session.post(url, data=data)

+    @staticmethod
+    def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
+        for job_type in JobType:
+            if job_type_str in job_type.value:
+                return [job_type]
+        return None
+
+    @staticmethod
+    def _add_params(scraper_input) -> dict[str, str | Any]:
+        params = {
+            "search": scraper_input.search_term,
+            "location": scraper_input.location,
+        }
+        if scraper_input.hours_old:
+            params["days"] = max(scraper_input.hours_old // 24, 1)
+        job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
+        if scraper_input.job_type:
+            job_type = scraper_input.job_type
+            params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
+        if scraper_input.easy_apply:
+            params["zipapply"] = 1
+        if scraper_input.is_remote:
+            params["remote"] = 1
+        if scraper_input.distance:
+            params["radius"] = scraper_input.distance
+        return {k: v for k, v in params.items() if v is not None}
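The `_add_params` staticmethod above maps scraper input onto ZipRecruiter query parameters, e.g. converting `hours_old` into a whole number of days (minimum 1) and dropping unset values. A standalone sketch with a hypothetical `SearchInput` stand-in for the project's ScraperInput:

# Illustrative sketch of the parameter-building logic above; SearchInput is a hypothetical stand-in.
from dataclasses import dataclass


@dataclass
class SearchInput:  # minimal stand-in for the project's ScraperInput
    search_term: str
    location: str | None = None
    hours_old: int | None = None
    is_remote: bool = False
    distance: int | None = None


def add_params(scraper_input: SearchInput) -> dict:
    params = {
        "search": scraper_input.search_term,
        "location": scraper_input.location,
    }
    if scraper_input.hours_old:
        params["days"] = max(scraper_input.hours_old // 24, 1)
    if scraper_input.is_remote:
        params["remote"] = 1
    if scraper_input.distance:
        params["radius"] = scraper_input.distance
    return {k: v for k, v in params.items() if v is not None}


print(add_params(SearchInput(search_term="engineer", hours_old=12, is_remote=True)))
# {'search': 'engineer', 'days': 1, 'remote': 1}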
@@ -0,0 +1,10 @@
+headers = {
+    "Host": "api.ziprecruiter.com",
+    "accept": "*/*",
+    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
+    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
+    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
+    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
+    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
+    "accept-language": "en-US,en;q=0.9",
+}
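These headers imitate ZipRecruiter's iOS app client. A minimal sketch of attaching them to a plain requests session (the requests usage here is an illustration; the scraper itself wires these headers through its own create_session helper), reusing the /jobs-app/event endpoint that appears in _get_cookies above:

# Illustrative only: attach the mobile-app headers above to a plain requests session.
import requests

session = requests.Session()
session.headers.update(headers)  # `headers` is the dict defined above

# Same endpoint the scraper's _get_cookies method posts to.
response = session.post("https://api.ziprecruiter.com/jobs-app/event")
print(response.status_code)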
@@ -0,0 +1,18 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_all():
+    sites = [
+        "indeed",
+        "glassdoor",
+    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
+    result = scrape_jobs(
+        site_name=sites,
+        search_term="engineer",
+        results_wanted=5,
+    )
+
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
+    ), "Result should be a non-empty DataFrame"
@@ -0,0 +1,13 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_glassdoor():
+    result = scrape_jobs(
+        site_name="glassdoor",
+        search_term="engineer",
+        results_wanted=5,
+    )
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a non-empty DataFrame"
@@ -0,0 +1,12 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_google():
+    result = scrape_jobs(
+        site_name="google", search_term="software engineer", results_wanted=5
+    )
+
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a non-empty DataFrame"
@@ -0,0 +1,13 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_indeed():
+    result = scrape_jobs(
+        site_name="indeed",
+        search_term="engineer",
+        results_wanted=5,
+    )
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a non-empty DataFrame"
@@ -0,0 +1,9 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_linkedin():
+    result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a non-empty DataFrame"
@@ -0,0 +1,12 @@
+from jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_ziprecruiter():
+    result = scrape_jobs(
+        site_name="zip_recruiter", search_term="software engineer", results_wanted=5
+    )
+
+    assert (
+        isinstance(result, pd.DataFrame) and len(result) == 5
+    ), "Result should be a non-empty DataFrame"