Compare commits

..

33 Commits

Author SHA1 Message Date
Cullen Watson
5bd199d0a5 Merge branch 'main' of https://github.com/Bunsly/JobSpy 2025-02-21 14:15:06 -06:00
Cullen Watson
4ec308a302 refactor:organize code 2025-02-21 14:14:55 -06:00
Cullen Watson
7cb0c518fc docs:readme 2025-02-21 12:53:59 -06:00
Cullen Watson
df70d4bc2e minor 2025-02-21 12:35:31 -06:00
Cullen Watson
3006063875 enh:remove log by default 2025-02-21 12:31:04 -06:00
Abdulrahman Hisham
1be009b8bc Adding Bayt.com Scraper to current codebase (#246) 2025-02-21 12:29:54 -06:00
Cullen Watson
81ed9b3ddf enh:remove log by default 2025-02-21 12:29:28 -06:00
Abdulrahman Al Muaitah
11a9e9a56a Fixed Bayt scraper integration 2025-02-21 20:10:02 +04:00
Abdulrahman Al Muaitah
c6ade14784 Added Bayt Scraper integration 2025-02-21 15:31:29 +04:00
Cullen Watson
13c74a0fed docs:readme 2025-02-09 13:42:18 -06:00
Cullen Watson
333e9e6760 docs:readme 2025-01-17 21:44:49 -06:00
github-actions
04032a0f91 Increment version 2024-12-04 22:55:06 +00:00
Cullen Watson
496896d0b5 enh:fix yml (#225) 2024-12-04 16:54:52 -06:00
Cullen Watson
87ba1ad1bf fix yml 2024-12-04 16:52:15 -06:00
Jason Geffner
4e7ac9a583 Fix Google job search (#223)
The previous regex did not capture all expected matches in the returned content
2024-12-04 16:45:59 -06:00
Cullen Watson
e44d13e1cf enh:auto update version 2024-12-04 16:29:38 -06:00
Cullen Watson
d52e366ef7 docs:readme 2024-11-26 15:51:26 -06:00
Cullen Watson
395ebf0017 docs:readme 2024-11-26 15:49:12 -06:00
Cullen Watson
63fddd9b7f docs:readme 2024-11-26 15:48:22 -06:00
Cullen Watson
58956868ae docs:readme 2024-11-26 15:47:10 -06:00
Cullen Watson
4fce836222 docs:readme 2024-10-28 03:53:59 -05:00
Cullen Watson
5ba25e7a7c docs:readme 2024-10-28 03:42:19 -05:00
Cullen Watson
f7cb3e9206 docs:readme 2024-10-28 03:36:21 -05:00
Cullen Watson
3ad3f121f7 docs:readme 2024-10-28 03:34:52 -05:00
Cullen Watson
ff3c782912 docs:readme 2024-10-25 18:12:08 -05:00
Cullen Watson
338d854b96 fix(google): search (#216) 2024-10-25 14:54:14 -05:00
Cullen Watson
811d4c40b4 chore:version 2024-10-24 15:28:25 -05:00
Cullen Watson
dba92d22c2 chore:version 2024-10-24 15:27:16 -05:00
Cullen Watson
10a3592a0f docs:file 2024-10-24 15:26:49 -05:00
Cullen Watson
b7905cc756 docs:file 2024-10-24 15:24:18 -05:00
Cullen Watson
6867d58829 docs:readme 2024-10-24 15:22:31 -05:00
Cullen Watson
f6248c8386 enh: google jobs (#214) 2024-10-24 15:19:40 -05:00
Cullen Watson
f395597fdd fix(indeed): offset 2024-10-22 19:25:07 -05:00
33 changed files with 1107 additions and 692 deletions

View File

@@ -1,9 +1,9 @@
name: Publish Python 🐍 distributions 📦 to PyPI
name: Publish JobSpy to PyPi
on: push
jobs:
build-n-publish:
name: Build and publish Python 🐍 distributions 📦 to PyPI
name: Build and publish JobSpy to PyPi
runs-on: ubuntu-latest
steps:
@@ -30,4 +30,4 @@ jobs:
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
password: ${{ secrets.PYPI_API_TOKEN }}

View File

@@ -1,22 +0,0 @@
name: Python Tests
on:
pull_request:
branches:
- main
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
run: |
pip install poetry
poetry install
- name: Run tests
run: poetry run pytest tests/test_all.py

144
README.md
View File

@@ -1,17 +1,12 @@
<img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">
**JobSpy** is a simple, yet comprehensive, job scraping library.
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
**JobSpy** is a job scraping library with the goal of aggregating all the jobs from popular job boards with one tool.
## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxies support
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
- Aggregates the job postings in a dataframe
- Proxies support to bypass blocking
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
@@ -30,16 +25,16 @@ import csv
from jobspy import scrape_jobs
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
search_term="software engineer",
location="Dallas, TX",
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
hours_old=72,
country_indeed='USA',
# linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
@@ -63,10 +58,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext
Optional
├── site_name (list|str):
| linkedin, zip_recruiter, indeed, glassdoor
| (default is all four)
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt
| (default is all)
├── search_term (str)
|
├── google_search_term (str)
| search term for google jobs. This is the only param for filtering google jobs.
├── location (str)
@@ -80,16 +78,13 @@ Optional
| in format ['user:pass@host:port', 'localhost']
| each job board scraper will round robin through the proxies
|
├── ca_cert (str)
| path to CA Certificate file for proxies
├── is_remote (bool)
├── results_wanted (int):
| number of job results to retrieve for each site specified in 'site_name'
├── easy_apply (bool):
| filters for jobs that are hosted on the job board site
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
├── description_format (str):
| markdown, html (Format type of the job descriptions. Default is markdown.)
@@ -116,6 +111,9 @@ Optional
|
├── enforce_annual_salary (bool):
| converts wages to annual salary
|
├── ca_cert (str)
| path to CA Certificate file for proxies
```
```
@@ -131,46 +129,6 @@ Optional
| - easy_apply
```
### JobPost Schema
```plaintext
JobPost
├── title
├── company
├── company_url
├── job_url
├── location
│ ├── country
│ ├── city
│ ├── state
├── description
├── job_type: fulltime, parttime, internship, contract
├── job_function
│ ├── interval: yearly, monthly, weekly, daily, hourly
│ ├── min_amount
│ ├── max_amount
│ ├── currency
│ └── salary_source: direct_data, description (parsed from posting)
├── date_posted
├── emails
└── is_remote
Linkedin specific
└── job_level
Linkedin & Indeed specific
└── company_industry
Indeed specific
├── company_country
├── company_addresses
├── company_employees_label
├── company_revenue_label
├── company_description
└── logo_photo_url
```
## Supported Countries for Job Searching
### **LinkedIn**
@@ -207,6 +165,11 @@ You can specify the following countries when searching on Indeed (use the exact
| United Arab Emirates | UK* | USA* | Uruguay |
| Venezuela | Vietnam* | | |
### **Bayt**
Bayt currently uses only the `search_term` parameter and searches internationally.
## Notes
* Indeed is the best scraper currently with no rate limiting.
@@ -217,7 +180,23 @@ You can specify the following countries when searching on Indeed (use the exact
---
**Q: Why is Indeed giving unrelated roles?**
**A:** Indeed is searching each one of your terms e.g. software intern, it searches software OR intern. Try search_term='"software intern"' in quotes for stricter searching
**A:** Indeed searches the description too.
- use - to remove words
- "" for exact match
Example of a good Indeed query
```py
search_term='"engineering intern" software summer (java OR python OR c++) 2025 -tax -marketing'
```
This searches the description/title and must include software, summer, 2025, one of the languages, engineering intern exactly, no tax, no marketing.
---
**Q: No results when using "google"?**
**A:** Google requires very specific syntax. Search for Google jobs in your browser, apply the filters you want, then copy whatever appears in the Google jobs search box and paste it into `google_search_term`.
---
@@ -229,8 +208,41 @@ You can specify the following countries when searching on Indeed (use the exact
---
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
### JobPost Schema
---
```plaintext
JobPost
├── title
├── company
├── company_url
├── job_url
├── location
│ ├── country
│ ├── city
│ ├── state
├── description
├── job_type: fulltime, parttime, internship, contract
├── job_function
│ ├── interval: yearly, monthly, weekly, daily, hourly
│ ├── min_amount
│ ├── max_amount
│ ├── currency
│ └── salary_source: direct_data, description (parsed from posting)
├── date_posted
├── emails
└── is_remote
Linkedin specific
└── job_level
Linkedin & Indeed specific
└── company_industry
Indeed specific
├── company_country
├── company_addresses
├── company_employees_label
├── company_revenue_label
├── company_description
└── company_logo
```

View File

@@ -1,27 +1,33 @@
from __future__ import annotations
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
from .jobs import JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
GlassdoorException,
import pandas as pd
from jobspy.bayt import BaytScraper
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
set_logger_level,
extract_salary,
create_logger,
get_enum_from_value,
map_str_to_site,
convert_to_annual,
desired_order,
)
from jobspy.ziprecruiter import ZipRecruiter
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
google_search_term: str | None = None,
location: str | None = None,
distance: int | None = 50,
is_remote: bool = False,
@@ -29,7 +35,6 @@ def scrape_jobs(
easy_apply: bool | None = None,
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxies: list[str] | str | None = None,
ca_cert: str | None = None,
description_format: str = "markdown",
@@ -38,30 +43,22 @@ def scrape_jobs(
offset: int | None = 0,
hours_old: int = None,
enforce_annual_salary: bool = False,
verbose: int = 2,
verbose: int = 0,
**kwargs,
) -> pd.DataFrame:
"""
Simultaneously scrapes job data from multiple job sites.
:return: pandas dataframe containing job data
Scrapes job data from job boards concurrently
:return: Pandas DataFrame containing job data
"""
SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper,
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.LINKEDIN: LinkedIn,
Site.INDEED: Indeed,
Site.ZIP_RECRUITER: ZipRecruiter,
Site.GLASSDOOR: Glassdoor,
Site.GOOGLE: Google,
Site.BAYT: BaytScraper,
}
set_logger_level(verbose)
def map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return job_type
raise Exception(f"Invalid job type: {value_str}")
job_type = get_enum_from_value(job_type) if job_type else None
def get_site_type():
@@ -83,6 +80,7 @@ def scrape_jobs(
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
google_search_term=google_search_term,
location=location,
distance=distance,
is_remote=is_remote,
@@ -120,28 +118,12 @@ def scrape_jobs(
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
job_data["max_amount"] *= 2080
if job_data["interval"] == "monthly":
job_data["min_amount"] *= 12
job_data["max_amount"] *= 12
if job_data["interval"] == "weekly":
job_data["min_amount"] *= 52
job_data["max_amount"] *= 52
if job_data["interval"] == "daily":
job_data["min_amount"] *= 260
job_data["max_amount"] *= 260
job_data["interval"] = "yearly"
jobs_dfs: list[pd.DataFrame] = []
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
job_data = job.dict()
job_url = job_data["job_url"]
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
@@ -204,38 +186,6 @@ def scrape_jobs(
# Step 2: Concatenate the filtered DataFrames
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
# Desired column order
desired_order = [
"id",
"site",
"job_url_hyper" if hyperlinks else "job_url",
"job_url_direct",
"title",
"company",
"location",
"job_type",
"date_posted",
"salary_source",
"interval",
"min_amount",
"max_amount",
"currency",
"is_remote",
"job_level",
"job_function",
"company_industry",
"listing_type",
"emails",
"description",
"company_url",
"logo_photo_url",
"company_url_direct",
"company_addresses",
"company_num_employees",
"company_revenue",
"company_description",
]
# Step 3: Ensure all desired columns are present, adding missing ones as empty
for column in desired_order:
if column not in jobs_df.columns:
@@ -245,6 +195,8 @@ def scrape_jobs(
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
return jobs_df.sort_values(
by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
else:
return pd.DataFrame()

145
jobspy/bayt/__init__.py Normal file
View File

@@ -0,0 +1,145 @@
from __future__ import annotations
import random
import time
from bs4 import BeautifulSoup
from jobspy.model import (
Scraper,
ScraperInput,
Site,
JobPost,
JobResponse,
Location,
Country,
)
from jobspy.util import create_logger, create_session
log = create_logger("Bayt")
class BaytScraper(Scraper):
    """Scraper for job listings on bayt.com.

    Pages through the international job search results for a search term and
    converts each listing element into a ``JobPost``.
    """

    base_url = "https://www.bayt.com"
    # Lower bound (seconds) of the pause between consecutive page requests.
    delay = 2
    # Width (seconds) of the random band added on top of ``delay``.
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
        self.scraper_input = None
        self.session = None
        self.country = "worldwide"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Bayt for jobs matching ``scraper_input.search_term``.

        :param scraper_input: job search criteria; a falsy ``results_wanted``
            falls back to 10 results.
        :return: JobResponse holding at most ``results_wanted`` jobs.
        """
        self.scraper_input = scraper_input
        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        job_list: list[JobPost] = []
        page = 1
        results_wanted = (
            scraper_input.results_wanted if scraper_input.results_wanted else 10
        )

        while len(job_list) < results_wanted:
            log.info(f"Fetching Bayt jobs page {page}")
            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
            if not job_elements:
                break

            log.debug(
                "First job element snippet:\n" + job_elements[0].prettify()[:500]
            )

            initial_count = len(job_list)
            for job in job_elements:
                try:
                    job_post = self._extract_job_info(job)
                    if job_post:
                        job_list.append(job_post)
                        if len(job_list) >= results_wanted:
                            break
                    else:
                        log.debug(
                            "Extraction returned None. Job snippet:\n"
                            + job.prettify()[:500]
                        )
                except Exception as e:
                    # Best effort: one malformed listing must not abort the page.
                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                    continue

            if len(job_list) == initial_count:
                log.info(f"No new jobs found on page {page}. Ending pagination.")
                break
            if len(job_list) >= results_wanted:
                # Enough results collected; skip the politeness delay.
                break

            page += 1
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

        # Slice with the effective limit (including the fallback of 10) so a
        # falsy ``results_wanted`` does not discard everything that was fetched.
        return JobResponse(jobs=job_list[:results_wanted])

    def _fetch_jobs(self, query: str, page: int) -> list | None:
        """
        Grabs the job results for the given query and page number.

        :return: list of ``<li>`` listing elements, or None if the request or
            parsing failed (the error is logged).
        """
        try:
            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
            log.debug(f"Found {len(job_listings)} job listing elements")
            return job_listings
        except Exception as e:
            log.error(f"Bayt: Error fetching jobs - {str(e)}")
            return None

    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
        """
        Extracts the job information from a single job listing.

        :return: JobPost, or None when the listing has no ``h2`` title element
            or no job link.
        """
        # Local import keeps this block self-contained.
        import hashlib

        # Find the h2 element holding the title and link (no class filtering)
        job_general_information = job.find("h2")
        if not job_general_information:
            return None

        job_title = job_general_information.get_text(strip=True)
        job_url = self._extract_job_url(job_general_information)
        if not job_url:
            return None

        # Company name lives in a <span> inside the "t-nowrap p10l" div.
        company_tag = job.find("div", class_="t-nowrap p10l")
        company_span = company_tag.find("span") if company_tag else None
        company_name = company_span.get_text(strip=True) if company_span else None

        # Location text lives in the "t-mute t-small" div.
        location_tag = job.find("div", class_="t-mute t-small")
        location = location_tag.get_text(strip=True) if location_tag else None

        # The builtin hash() of a str is salted per process, so it yields a
        # different id on every run; a content digest keeps the id stable.
        url_digest = hashlib.sha256(job_url.encode("utf-8")).hexdigest()[:16]
        job_id = f"bayt-{url_digest}"

        location_obj = Location(
            city=location,
            country=Country.from_string(self.country),
        )
        return JobPost(
            id=job_id,
            title=job_title,
            company_name=company_name,
            location=location_obj,
            job_url=job_url,
        )

    def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
        """
        Pulls the job URL from the 'a' within the h2 element.

        :return: ``base_url`` joined with the link's href, or None when there
            is no anchor or it has no href.
        """
        a_tag = job_general_information.find("a")
        if a_tag and a_tag.has_attr("href"):
            return self.base_url + a_tag["href"].strip()
        return None

View File

@@ -1,5 +1,5 @@
"""
jobspy.scrapers.exceptions
jobspy.jobboard.exceptions
~~~~~~~~~~~~~~~~~~~
This module contains the set of Scrapers' exceptions.
@@ -24,3 +24,13 @@ class ZipRecruiterException(Exception):
class GlassdoorException(Exception):
    """Glassdoor error that falls back to a generic message when none is given."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Glassdoor"
        super().__init__(message)
class GoogleJobsException(Exception):
    """Google Jobs error that falls back to a generic message when none is given."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Google Jobs"
        super().__init__(message)
class BaytException(Exception):
    """Bayt error that falls back to a generic message when none is given."""

    def __init__(self, message=None):
        if not message:
            message = "An error occurred with Bayt"
        super().__init__(message)

View File

@@ -1,41 +1,38 @@
"""
jobspy.scrapers.glassdoor
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Glassdoor.
"""
from __future__ import annotations
import re
import json
import requests
from typing import Optional, Tuple
from typing import Tuple
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text, create_logger
from ..exceptions import GlassdoorException
from ..utils import (
from jobspy.glassdoor.constant import fallback_token, query_template, headers
from jobspy.glassdoor.util import (
get_cursor_for_page,
parse_compensation,
parse_location,
)
from jobspy.util import (
extract_emails_from_text,
create_logger,
create_session,
markdown_converter,
)
from ...jobs import (
from jobspy.exception import GlassdoorException
from jobspy.model import (
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
DescriptionFormat,
Scraper,
ScraperInput,
Site,
)
logger = create_logger("Glassdoor")
log = create_logger("Glassdoor")
class GlassdoorScraper(Scraper):
class Glassdoor(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
@@ -64,7 +61,7 @@ class GlassdoorScraper(Scraper):
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
)
token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token
@@ -74,7 +71,7 @@ class GlassdoorScraper(Scraper):
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
logger.error("Glassdoor: location not parsed")
log.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
job_list: list[JobPost] = []
cursor = None
@@ -83,7 +80,7 @@ class GlassdoorScraper(Scraper):
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"search page: {page} / {range_end-1}")
log.info(f"search page: {page} / {range_end - 1}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
@@ -93,7 +90,7 @@ class GlassdoorScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=job_list)
@@ -129,7 +126,7 @@ class GlassdoorScraper(Scraper):
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -146,7 +143,7 @@ class GlassdoorScraper(Scraper):
except Exception as exc:
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
return jobs, self.get_cursor_for_page(
return jobs, get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
)
@@ -185,9 +182,9 @@ class GlassdoorScraper(Scraper):
if location_type == "S":
is_remote = True
else:
location = self.parse_location(location_name)
location = parse_location(location_name)
compensation = self.parse_compensation(job["header"])
compensation = parse_compensation(job["header"])
try:
description = self._fetch_job_description(job_id)
except:
@@ -214,7 +211,7 @@ class GlassdoorScraper(Scraper):
is_remote=is_remote,
description=description,
emails=extract_emails_from_text(description) if description else None,
logo_photo_url=company_logo,
company_logo=company_logo,
listing_type=listing_type,
)
@@ -264,12 +261,12 @@ class GlassdoorScraper(Scraper):
if res.status_code != 200:
if res.status_code == 429:
err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
log.error(err)
return None, None
else:
err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
log.error(f"Glassdoor response status code {res.status_code}")
return None, None
items = res.json()
@@ -321,44 +318,3 @@ class GlassdoorScraper(Scraper):
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
)
return json.dumps([payload])
@staticmethod
def parse_compensation(data: dict) -> Optional[Compensation]:
pay_period = data.get("payPeriod")
adjusted_pay = data.get("payPeriodAdjustedPay")
currency = data.get("payCurrency", "USD")
if not pay_period or not adjusted_pay:
return None
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)
return Compensation(
interval=interval,
min_amount=min_amount,
max_amount=max_amount,
currency=currency,
)
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
@staticmethod
def parse_location(location_name: str) -> Location | None:
if not location_name or location_name == "Remote":
return
city, _, state = location_name.partition(", ")
return Location(city=city, state=state)
@staticmethod
def get_cursor_for_page(pagination_cursors, page_num):
for cursor_data in pagination_cursors:
if cursor_data["pageNumber"] == page_num:
return cursor_data["cursor"]

42
jobspy/glassdoor/util.py Normal file
View File

@@ -0,0 +1,42 @@
from jobspy.model import Compensation, CompensationInterval, Location, JobType
def parse_compensation(data: dict) -> Compensation | None:
    """Build a Compensation from a Glassdoor job header dict.

    Reads ``payPeriod``, ``payPeriodAdjustedPay`` (its ``p10``/``p90`` entries
    become the min/max amounts, floored to ints) and ``payCurrency``
    (defaulting to "USD"). Returns None when the pay period or the adjusted
    pay data is missing or empty.
    """
    period = data.get("payPeriod")
    pay_range = data.get("payPeriodAdjustedPay")
    if not (period and pay_range):
        return None

    # "ANNUAL" is mapped explicitly; every other period is resolved by the enum.
    if period == "ANNUAL":
        pay_interval = CompensationInterval.YEARLY
    else:
        pay_interval = CompensationInterval.get_interval(period)

    low = int(pay_range.get("p10") // 1)
    high = int(pay_range.get("p90") // 1)

    return Compensation(
        interval=pay_interval,
        min_amount=low,
        max_amount=high,
        currency=data.get("payCurrency", "USD"),
    )
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    """Return a one-element list holding the first JobType whose value contains
    ``job_type_str``, or None when no member matches."""
    found = next(
        (candidate for candidate in JobType if job_type_str in candidate.value),
        None,
    )
    if found is None:
        return None
    return [found]
def parse_location(location_name: str) -> Location | None:
    """Split a "City, State" string into a Location.

    Returns None for an empty/missing name or the literal "Remote". The text
    before the first ", " becomes the city and the remainder the state.
    """
    if location_name and location_name != "Remote":
        city_part, _sep, state_part = location_name.partition(", ")
        return Location(city=city_part, state=state_part)
    return None
def get_cursor_for_page(pagination_cursors, page_num):
    """Return the ``cursor`` of the first entry whose ``pageNumber`` equals
    ``page_num``, or None when no entry matches."""
    return next(
        (
            entry["cursor"]
            for entry in pagination_cursors
            if entry["pageNumber"] == page_num
        ),
        None,
    )

202
jobspy/google/__init__.py Normal file
View File

@@ -0,0 +1,202 @@
from __future__ import annotations
import math
import re
import json
from typing import Tuple
from datetime import datetime, timedelta
from jobspy.google.constant import headers_jobs, headers_initial, async_param
from jobspy.model import (
Scraper,
ScraperInput,
Site,
JobPost,
JobResponse,
Location,
JobType,
)
from jobspy.util import extract_emails_from_text, extract_job_type, create_session
from jobspy.google.util import log, find_job_info_initial_page, find_job_info
class Google(Scraper):
    """Scraper for Google job search results.

    Fetches the first results page from the regular search endpoint, then
    follows the ``data-async-fc`` forward cursor through the async callback
    endpoint to page through further listings.
    """

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes Google Scraper with the Google jobs search url
        """
        site = Site(Site.GOOGLE)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10
        # URLs already turned into JobPosts; used to drop duplicates across pages.
        self.seen_urls = set()
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # Cap the request at 900 results.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
        if forward_cursor is None:
            log.warning(
                "initial cursor not found, try changing your query or there was at most 10 results"
            )
            return JobResponse(jobs=job_list)

        page = 1
        while (
            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
            and forward_cursor
        ):
            log.info(
                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
            )
            try:
                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            except Exception as e:
                log.error(f"failed to get jobs on page: {page}, {e}")
                break
            if not jobs:
                log.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1

        return JobResponse(
            jobs=job_list[
                scraper_input.offset : scraper_input.offset
                + scraper_input.results_wanted
            ]
        )

    def _get_initial_cursor_and_jobs(self) -> Tuple[str | None, list[JobPost]]:
        """Gets initial cursor and jobs to paginate through job listings

        The query is assembled from the search term, job type, location, age
        and remote flags, unless ``google_search_term`` is set, in which case
        that string is used verbatim. The returned cursor is None when the
        page carries no ``data-async-fc`` attribute.
        """
        query = f"{self.scraper_input.search_term} jobs"

        def get_time_range(hours_old):
            # Map an age in hours onto the coarse phrases used in the query.
            if hours_old <= 24:
                return "since yesterday"
            elif hours_old <= 72:
                return "in the last 3 days"
            elif hours_old <= 168:
                return "in the last week"
            else:
                return "in the last month"

        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }

        if self.scraper_input.job_type in job_type_mapping:
            query += f" {job_type_mapping[self.scraper_input.job_type]}"

        if self.scraper_input.location:
            query += f" near {self.scraper_input.location}"

        if self.scraper_input.hours_old:
            time_filter = get_time_range(self.scraper_input.hours_old)
            query += f" {time_filter}"

        if self.scraper_input.is_remote:
            query += " remote"

        # An explicit google_search_term overrides the assembled query entirely.
        if self.scraper_input.google_search_term:
            query = self.scraper_input.google_search_term

        params = {"q": query, "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, response.text)
        data_async_fc = match_fc.group(1) if match_fc else None

        jobs_raw = find_job_info_initial_page(response.text)
        jobs = []
        for job_raw in jobs_raw:
            job_post = self._parse_job(job_raw)
            if job_post:
                jobs.append(job_post)
        return data_async_fc, jobs

    def _get_jobs_next_page(
        self, forward_cursor: str
    ) -> Tuple[list[JobPost], str | None]:
        """Fetches the page behind ``forward_cursor`` and returns its jobs
        together with the cursor for the following page."""
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str | None]:
        """
        Parses jobs on a page with next page cursor

        :raises ValueError: when the response contains no "]]]" terminator
            (from ``str.rindex``) or the extracted span is not valid JSON.
        """
        start_idx = job_data.find("[[[")
        end_idx = job_data.rindex("]]]") + 3
        s = job_data[start_idx:end_idx]
        parsed = json.loads(s)[0]

        pattern_fc = r'data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, job_data)
        data_async_fc = match_fc.group(1) if match_fc else None

        jobs_on_page = []
        for array in parsed:
            # Distinct name so the ``job_data`` parameter is not shadowed.
            _, payload = array
            if not payload.startswith("[[["):
                continue
            job_d = json.loads(payload)

            job_info = find_job_info(job_d)
            job_post = self._parse_job(job_info)
            if job_post:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list) -> JobPost | None:
        """Converts one raw job info list into a JobPost.

        :return: JobPost, or None when the job's URL has already been seen.
        """
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return None
        self.seen_urls.add(job_url)

        title = job_info[0]
        company_name = job_info[1]
        location = city = job_info[2]
        state = country = date_posted = None
        if location and "," in location:
            city, state, *country = [part.strip() for part in location.split(",")]

        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            match = re.search(r"\d+", days_ago_str)
            # Only derive a date when a number is present; previously a string
            # without digits passed days=None to timedelta and raised TypeError.
            if match:
                days_ago = int(match.group())
                date_posted = (datetime.now() - timedelta(days=days_ago)).date()

        description = job_info[19]

        job_post = JobPost(
            id=f"go-{job_info[28]}",
            title=title,
            company_name=company_name,
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            date_posted=date_posted,
            is_remote="remote" in description.lower() or "wfh" in description.lower(),
            description=description,
            emails=extract_emails_from_text(description),
            job_type=extract_job_type(description),
        )
        return job_post

52
jobspy/google/constant.py Normal file
View File

@@ -0,0 +1,52 @@
# Browser-like request headers describing a document navigation
# (sec-fetch-dest "document", sec-fetch-mode "navigate") from desktop
# Chrome 130 on macOS.
# NOTE(review): presumably sent with the first Google search request —
# the call site is not in this file; confirm against the scraper.
headers_initial = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=0, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "x-browser-channel": "stable",
    "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
    "x-browser-year": "2024",
}
# Browser-like request headers describing a same-origin CORS fetch
# (sec-fetch-dest "empty", sec-fetch-mode "cors") from desktop Chrome 130
# on macOS. The Google scraper's _get_jobs_next_page passes this dict as
# the headers of its paginated jobs request.
headers_jobs = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=1, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
}
# Value sent as the "async" query parameter of the paginated jobs request
# (see _get_jobs_next_page in the Google scraper). It is a comma-separated
# list of "_key:value" entries (_basejs, _basecss, _basecomb, _fmt, _id).
# NOTE(review): looks like a value captured from a real browser session;
# it may need refreshing if Google changes its asset paths — confirm.
async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"

41
jobspy/google/util.py Normal file
View File

@@ -0,0 +1,41 @@
import re
from jobspy.util import create_logger
log = create_logger("Google")
def find_job_info(jobs_data: list | dict) -> list | None:
"""Iterates through the JSON data to find the job listings"""
if isinstance(jobs_data, dict):
for key, value in jobs_data.items():
if key == "520084652" and isinstance(value, list):
return value
else:
result = find_job_info(value)
if result:
return result
elif isinstance(jobs_data, list):
for item in jobs_data:
result = find_job_info(item)
if result:
return result
return None
def find_job_info_initial_page(html_text: str):
    """
    Collects every job listing embedded in the initial page's HTML.

    Each regex match after the 520084652 key is decoded as JSON and added
    to the result. A match that fails to decode is logged and recorded as
    a dict with "raw_match" and "error" keys instead.

    :param html_text: raw HTML of the page
    :return: list with one entry per regex match
    """
    import json

    pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
    results = []
    for found in re.finditer(pattern, html_text):
        payload = found.group(1)
        try:
            results.append(json.loads(payload))
        except json.JSONDecodeError as e:
            log.error(f"Failed to parse match: {str(e)}")
            results.append({"raw_match": found.group(0), "error": str(e)})
    return results

View File

@@ -1,39 +1,32 @@
"""
jobspy.scrapers.indeed
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Indeed.
"""
from __future__ import annotations
import math
from typing import Tuple
from datetime import datetime
from typing import Tuple
from .constants import job_search_query, api_headers
from .. import Scraper, ScraperInput, Site
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
markdown_converter,
create_session,
create_logger,
)
from ...jobs import (
from jobspy.indeed.constant import job_search_query, api_headers
from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
from jobspy.model import (
Scraper,
ScraperInput,
Site,
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
DescriptionFormat,
)
from jobspy.util import (
extract_emails_from_text,
markdown_converter,
create_session,
create_logger,
)
logger = create_logger("Indeed")
log = create_logger("Indeed")
class IndeedScraper(Scraper):
class Indeed(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
@@ -69,25 +62,23 @@ class IndeedScraper(Scraper):
page = 1
cursor = None
offset_pages = math.ceil(self.scraper_input.offset / 100)
for _ in range(offset_pages):
logger.info(f"skipping search page: {page}")
__, cursor = self._scrape_page(cursor)
if not __:
logger.info(f"found no jobs on page: {page}")
break
while len(self.seen_urls) < scraper_input.results_wanted:
logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}"
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
log.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, cursor = self._scrape_page(cursor)
if not jobs:
logger.info(f"found no jobs on page: {page}")
log.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
return JobResponse(
jobs=job_list[
scraper_input.offset : scraper_input.offset
+ scraper_input.results_wanted
]
)
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
"""
@@ -124,9 +115,10 @@ class IndeedScraper(Scraper):
headers=api_headers_temp,
json=payload,
timeout=10,
verify=False,
)
if not response.ok:
logger.info(
log.info(
f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
)
return jobs, new_cursor
@@ -214,7 +206,7 @@ class IndeedScraper(Scraper):
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
job_type = self._get_job_type(job["attributes"])
job_type = get_job_type(job["attributes"])
timestamp_seconds = job["datePublished"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
employer = job["employer"].get("dossier") if job["employer"] else None
@@ -235,14 +227,14 @@ class IndeedScraper(Scraper):
country=job.get("location", {}).get("countryCode"),
),
job_type=job_type,
compensation=self._get_compensation(job["compensation"]),
compensation=get_compensation(job["compensation"]),
date_posted=date_posted,
job_url=job_url,
job_url_direct=(
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
),
emails=extract_emails_from_text(description) if description else None,
is_remote=self._is_job_remote(job, description),
is_remote=is_job_remote(job, description),
company_addresses=(
employer_details["addresses"][0]
if employer_details.get("addresses")
@@ -260,92 +252,9 @@ class IndeedScraper(Scraper):
company_num_employees=employer_details.get("employeesLocalizedLabel"),
company_revenue=employer_details.get("revenueLocalizedLabel"),
company_description=employer_details.get("briefDescription"),
logo_photo_url=(
company_logo=(
employer["images"].get("squareLogoUrl")
if employer and employer.get("images")
else None
),
)
@staticmethod
def _get_job_type(attributes: list) -> list[JobType]:
"""
Parses the attributes to get list of job types
:param attributes:
:return: list of JobType
"""
job_types: list[JobType] = []
for attribute in attributes:
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
return job_types
@staticmethod
def _get_compensation(compensation: dict) -> Compensation | None:
"""
Parses the job to get compensation
:param job:
:return: compensation object
"""
if not compensation["baseSalary"] and not compensation["estimated"]:
return None
comp = (
compensation["baseSalary"]
if compensation["baseSalary"]
else compensation["estimated"]["baseSalary"]
)
if not comp:
return None
interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
if not interval:
return None
min_range = comp["range"].get("min")
max_range = comp["range"].get("max")
return Compensation(
interval=interval,
min_amount=int(min_range) if min_range is not None else None,
max_amount=int(max_range) if max_range is not None else None,
currency=(
compensation["estimated"]["currencyCode"]
if compensation["estimated"]
else compensation["currencyCode"]
),
)
@staticmethod
def _is_job_remote(job: dict, description: str) -> bool:
"""
Searches the description, location, and attributes to check if job is remote
"""
remote_keywords = ["remote", "work from home", "wfh"]
is_remote_in_attributes = any(
any(keyword in attr["label"].lower() for keyword in remote_keywords)
for attr in job["attributes"]
)
is_remote_in_description = any(
keyword in description.lower() for keyword in remote_keywords
)
is_remote_in_location = any(
keyword in job["location"]["formatted"]["long"].lower()
for keyword in remote_keywords
)
return (
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
)
@staticmethod
def _get_compensation_interval(interval: str) -> CompensationInterval:
interval_mapping = {
"DAY": "DAILY",
"YEAR": "YEARLY",
"HOUR": "HOURLY",
"WEEK": "WEEKLY",
"MONTH": "MONTHLY",
}
mapped_interval = interval_mapping.get(interval.upper(), None)
if mapped_interval and mapped_interval in CompensationInterval.__members__:
return CompensationInterval[mapped_interval]
else:
raise ValueError(f"Unsupported interval: {interval}")

83
jobspy/indeed/util.py Normal file
View File

@@ -0,0 +1,83 @@
from jobspy.model import CompensationInterval, JobType, Compensation
from jobspy.util import get_enum_from_job_type
def get_job_type(attributes: list) -> list[JobType]:
    """
    Converts attribute labels into job types

    Each label has its hyphens and spaces removed and is lower-cased before
    being handed to get_enum_from_job_type; labels that yield a falsy
    result are skipped.
    :param attributes: items that each expose a "label" string
    :return: list of JobType
    """
    return [
        job_type
        for attribute in attributes
        if (
            job_type := get_enum_from_job_type(
                attribute["label"].replace("-", "").replace(" ", "").lower()
            )
        )
    ]
def get_compensation(compensation: dict) -> Compensation | None:
    """
    Parses the job to get compensation
    :param compensation: compensation dict of a job; its "baseSalary",
        "estimated" and "currencyCode" entries are read
    :return: compensation object, or None when no usable salary is present
    :raises ValueError: if the salary's unit of work is not a supported interval
    """
    if not compensation["baseSalary"] and not compensation["estimated"]:
        return None
    # Prefer the salary given directly; fall back to the estimated one.
    comp = (
        compensation["baseSalary"]
        if compensation["baseSalary"]
        else compensation["estimated"]["baseSalary"]
    )
    if not comp:
        return None
    unit_of_work = comp["unitOfWork"]
    if not unit_of_work:
        # Without a unit the amounts cannot be interpreted; previously this
        # crashed with AttributeError inside get_compensation_interval.
        return None
    interval = get_compensation_interval(unit_of_work)
    if not interval:
        return None
    min_range = comp["range"].get("min")
    max_range = comp["range"].get("max")
    return Compensation(
        interval=interval,
        min_amount=int(min_range) if min_range is not None else None,
        max_amount=int(max_range) if max_range is not None else None,
        currency=(
            compensation["estimated"]["currencyCode"]
            if compensation["estimated"]
            else compensation["currencyCode"]
        ),
    )
def is_job_remote(job: dict, description: str) -> bool:
    """
    Searches the description, location, and attributes to check if job is remote
    :param job: job dict; job["attributes"][i]["label"] and
        job["location"]["formatted"]["long"] are read
    :param description: job description text; may be None or empty
    :return: True if any remote keyword appears in any of the three sources
    """
    remote_keywords = ["remote", "work from home", "wfh"]

    def mentions_remote(text: str) -> bool:
        lowered = text.lower()
        return any(keyword in lowered for keyword in remote_keywords)

    is_remote_in_attributes = any(
        mentions_remote(attr["label"]) for attr in job["attributes"]
    )
    # The caller may pass no description at all; treat that as empty text
    # instead of crashing on None.lower().
    is_remote_in_description = mentions_remote(description or "")
    is_remote_in_location = mentions_remote(job["location"]["formatted"]["long"])
    return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
def get_compensation_interval(interval: str) -> CompensationInterval:
    """
    Translates a unit of work (DAY, YEAR, HOUR, WEEK, MONTH; any case)
    into the matching CompensationInterval member
    :param interval: unit-of-work string
    :return: CompensationInterval member
    :raises ValueError: if the unit is unknown or has no matching member
    """
    member_name = {
        "DAY": "DAILY",
        "YEAR": "YEARLY",
        "HOUR": "HOURLY",
        "WEEK": "WEEKLY",
        "MONTH": "MONTHLY",
    }.get(interval.upper(), None)
    if not member_name or member_name not in CompensationInterval.__members__:
        raise ValueError(f"Unsupported interval: {interval}")
    return CompensationInterval[member_name]

View File

@@ -1,47 +1,48 @@
"""
jobspy.scrapers.linkedin
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape LinkedIn.
"""
from __future__ import annotations
import math
import time
import random
import regex as re
from typing import Optional
import time
from datetime import datetime
from bs4.element import Tag
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse, urlunparse, unquote
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException
from ..utils import create_session, remove_attributes, create_logger
from ...jobs import (
import regex as re
from bs4 import BeautifulSoup
from bs4.element import Tag
from jobspy.exception import LinkedInException
from jobspy.linkedin.constant import headers
from jobspy.linkedin.util import (
job_type_code,
parse_job_type,
parse_job_level,
parse_company_industry,
)
from jobspy.model import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation,
DescriptionFormat,
Scraper,
ScraperInput,
Site,
)
from ..utils import (
from jobspy.util import (
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter,
create_session,
remove_attributes,
create_logger,
)
logger = create_logger("LinkedIn")
log = create_logger("LinkedIn")
class LinkedInScraper(Scraper):
class LinkedIn(Scraper):
base_url = "https://www.linkedin.com"
delay = 3
band_delay = 4
@@ -86,7 +87,7 @@ class LinkedInScraper(Scraper):
)
while continue_search():
request_count += 1
logger.info(
log.info(
f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = {
@@ -95,7 +96,7 @@ class LinkedInScraper(Scraper):
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": (
self.job_type_code(scraper_input.job_type)
job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None
),
@@ -126,13 +127,13 @@ class LinkedInScraper(Scraper):
else:
err = f"LinkedIn response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
log.error(err)
return JobResponse(jobs=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"LinkedIn: Bad proxy")
log.error(f"LinkedIn: Bad proxy")
else:
logger.error(f"LinkedIn: {str(e)}")
log.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")
@@ -232,7 +233,7 @@ class LinkedInScraper(Scraper):
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)
@@ -275,18 +276,18 @@ class LinkedInScraper(Scraper):
if job_function_span:
job_function = job_function_span.text.strip()
logo_photo_url = (
company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_level": parse_job_level(soup),
"company_industry": parse_company_industry(soup),
"job_type": parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": logo_photo_url,
"company_logo": company_logo,
"job_function": job_function,
}
@@ -316,77 +317,6 @@ class LinkedInScraper(Scraper):
location = Location(city=city, state=state, country=country)
return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
@@ -403,13 +333,3 @@ class LinkedInScraper(Scraper):
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")

85
jobspy/linkedin/util.py Normal file
View File

@@ -0,0 +1,85 @@
from bs4 import BeautifulSoup
from jobspy.model import JobType
from jobspy.util import get_enum_from_job_type
def job_type_code(job_type_enum: JobType) -> str:
    """
    Returns the one-letter code for a job type
    :param job_type_enum: job type to translate
    :return: "F", "P", "I", "C" or "T", or "" for any other job type
    """
    codes = {
        JobType.FULL_TIME: "F",
        JobType.PART_TIME: "P",
        JobType.INTERNSHIP: "I",
        JobType.CONTRACT: "C",
        JobType.TEMPORARY: "T",
    }
    if job_type_enum in codes:
        return codes[job_type_enum]
    return ""
def _find_criteria_text(soup: BeautifulSoup, heading: str) -> str | None:
    """
    Returns the stripped text of the criteria <span> that follows the
    job-criteria <h3> subheader containing heading, or None if either
    element is missing
    """
    h3_tag = soup.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter can be handed None for a tag without a single string
        # child; `in None` would raise TypeError.
        string=lambda text: text is not None and heading in text,
    )
    if not h3_tag:
        return None
    criteria_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    if not criteria_span:
        return None
    return criteria_span.get_text(strip=True)


def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
    """
    Gets the job type from job page
    :param soup_job_type: parsed job page
    :return: single-element list with the JobType, or [] when the page has
        no employment type or it maps to no known JobType
    """
    employment_type = _find_criteria_text(soup_job_type, "Employment type")
    if not employment_type:
        return []
    employment_type = employment_type.lower().replace("-", "")
    if not employment_type:
        return []
    job_type = get_enum_from_job_type(employment_type)
    # An unrecognised type yields a falsy result; don't return [None].
    return [job_type] if job_type else []


def parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
    """
    Gets the job level from job page
    :param soup_job_level: parsed job page
    :return: text under the "Seniority level" subheader, or None
    """
    return _find_criteria_text(soup_job_level, "Seniority level")


def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page
    :param soup_industry: parsed job page
    :return: text under the "Industries" subheader, or None
    """
    return _find_criteria_text(soup_industry, "Industries")

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Optional
from datetime import date
from enum import Enum
@@ -256,7 +257,7 @@ class JobPost(BaseModel):
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
logo_photo_url: str | None = None
company_logo: str | None = None
banner_photo_url: str | None = None
# linkedin only atm
@@ -265,3 +266,49 @@ class JobPost(BaseModel):
class JobResponse(BaseModel):
jobs: list[JobPost] = []
class Site(Enum):
    """Job boards known to the package; each value is the board's lowercase name."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"
class SalarySource(Enum):
    """Labels for where a job's salary figures were taken from."""

    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"
# Search criteria handed to every Scraper.scrape() call.
class ScraperInput(BaseModel):
    # sites to scrape
    site_type: list[Site]
    search_term: str | None = None
    # NOTE(review): presumably a separate query used only for Google — confirm
    google_search_term: str | None = None
    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    # number of leading results to skip
    offset: int = 0
    # NOTE(review): the linkedin_* options look LinkedIn-specific — confirm
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
    # maximum number of jobs to return
    results_wanted: int = 15
    # NOTE(review): presumably the maximum age of a posting in hours — confirm
    hours_old: int | None = None
class Scraper(ABC):
    """
    Abstract base for the site scrapers.

    Stores the target site together with the optional proxy list and CA
    certificate; subclasses supply scrape().
    """

    def __init__(
        self,
        site: Site,
        proxies: list[str] | None = None,
        ca_cert: str | None = None,
    ):
        self.site, self.proxies, self.ca_cert = site, proxies, ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run a search described by scraper_input and return the jobs found."""
        ...

View File

@@ -1,16 +1,19 @@
from __future__ import annotations
import re
import logging
import re
from itertools import cycle
import numpy as np
import requests
import tls_client
import numpy as np
import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType
from jobspy.model import CompensationInterval, JobType, Site
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def create_logger(name: str):
@@ -129,7 +132,7 @@ def create_session(
return session
def set_logger_level(verbose: int = 2):
def set_logger_level(verbose: int):
"""
Adjusts the logger's level. This function allows the logging level to be changed at runtime.
@@ -264,3 +267,81 @@ def extract_salary(
else:
return interval, min_salary, max_salary, "USD"
return None, None, None, None
def extract_job_type(description: str):
    """
    Finds the job types mentioned in a description

    The search is case-insensitive. "full time", "fulltime" and "full-time"
    (likewise for part time) are all recognised.
    :param description: job description text; may be None or empty
    :return: [] for an empty description, the list of matching JobType
        members otherwise, or None when nothing matches
    """
    if not description:
        return []

    keywords = {
        # [\s-]? also accepts the hyphenated spelling, which the previous
        # \s? pattern missed.
        JobType.FULL_TIME: r"full[\s-]?time",
        JobType.PART_TIME: r"part[\s-]?time",
        JobType.INTERNSHIP: r"internship",
        JobType.CONTRACT: r"contract",
    }

    listing_types = [
        key
        for key, pattern in keywords.items()
        if re.search(pattern, description, re.IGNORECASE)
    ]
    return listing_types if listing_types else None
def map_str_to_site(site_name: str) -> Site:
    """
    Looks up the Site member whose name is site_name upper-cased
    :raises KeyError: if no member has that name
    """
    member_name = site_name.upper()
    return Site[member_name]
def get_enum_from_value(value_str):
    """
    Returns the first JobType whose value contains value_str
    :raises Exception: if no JobType matches
    """
    matching = next(
        (job_type for job_type in JobType if value_str in job_type.value), None
    )
    if matching is None:
        raise Exception(f"Invalid job type: {value_str}")
    return matching
def convert_to_annual(job_data: dict):
    """
    Rescales a job's salary range to a yearly figure, in place

    min_amount and max_amount are multiplied by the number of periods per
    year for the current interval (hourly 2080, daily 260, weekly 52,
    monthly 12). Any other interval leaves the amounts untouched. The
    interval is then always set to "yearly".
    :param job_data: dict with "interval", "min_amount" and "max_amount"
    """
    periods_per_year = {
        "hourly": 2080,
        "monthly": 12,
        "weekly": 52,
        "daily": 260,
    }
    factor = periods_per_year.get(job_data["interval"])
    if factor is not None:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"
# Ordered list of job field / column names.
# NOTE(review): presumably the column order applied to the final results
# table — the consumer is not in this file; confirm against the caller.
desired_order = [
    "id",
    "site",
    "job_url",
    "job_url_direct",
    "title",
    "company",
    "location",
    "date_posted",
    "job_type",
    "salary_source",
    "interval",
    "min_amount",
    "max_amount",
    "currency",
    "is_remote",
    "job_level",
    "job_function",
    "listing_type",
    "emails",
    "description",
    "company_industry",
    "company_url",
    "company_logo",
    "company_url_direct",
    "company_addresses",
    "company_num_employees",
    "company_revenue",
    "company_description",
]

View File

@@ -1,46 +1,39 @@
"""
jobspy.scrapers.ziprecruiter
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape ZipRecruiter.
"""
from __future__ import annotations
import json
import math
import re
import time
from datetime import datetime
from typing import Optional, Tuple, Any
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from bs4 import BeautifulSoup
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..utils import (
from jobspy.ziprecruiter.constant import headers, get_cookie_data
from jobspy.util import (
extract_emails_from_text,
create_session,
markdown_converter,
remove_attributes,
create_logger,
)
from ...jobs import (
from jobspy.model import (
JobPost,
Compensation,
Location,
JobResponse,
JobType,
Country,
DescriptionFormat,
Scraper,
ScraperInput,
Site,
)
from jobspy.ziprecruiter.util import get_job_type_enum, add_params
logger = create_logger("ZipRecruiter")
log = create_logger("ZipRecruiter")
class ZipRecruiterScraper(Scraper):
class ZipRecruiter(Scraper):
base_url = "https://www.ziprecruiter.com"
api_url = "https://api.ziprecruiter.com"
@@ -77,7 +70,7 @@ class ZipRecruiterScraper(Scraper):
break
if page > 1:
time.sleep(self.delay)
logger.info(f"search page: {page} / {max_pages}")
log.info(f"search page: {page} / {max_pages}")
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
@@ -91,7 +84,7 @@ class ZipRecruiterScraper(Scraper):
def _find_jobs_in_page(
self, scraper_input: ScraperInput, continue_token: str | None = None
) -> Tuple[list[JobPost], Optional[str]]:
) -> tuple[list[JobPost], str | None]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
@@ -99,7 +92,7 @@ class ZipRecruiterScraper(Scraper):
:return: jobs found on page
"""
jobs_list = []
params = self._add_params(scraper_input)
params = add_params(scraper_input)
if continue_token:
params["continue_from"] = continue_token
try:
@@ -110,13 +103,13 @@ class ZipRecruiterScraper(Scraper):
else:
err = f"ZipRecruiter response status code {res.status_code}"
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
logger.error(err)
log.error(err)
return jobs_list, ""
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Indeed: Bad proxy")
log.error(f"Indeed: Bad proxy")
else:
logger.error(f"Indeed: {str(e)}")
log.error(f"Indeed: {str(e)}")
return jobs_list, ""
res_data = res.json()
@@ -152,7 +145,7 @@ class ZipRecruiterScraper(Scraper):
location = Location(
city=job.get("job_city"), state=job.get("job_state"), country=country_enum
)
job_type = self._get_job_type_enum(
job_type = get_job_type_enum(
job.get("employment_type", "").replace("_", "").lower()
)
date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
@@ -201,13 +194,17 @@ class ZipRecruiterScraper(Scraper):
else ""
)
description_full = job_description_clean + company_description_clean
script_tag = soup.find("script", type="application/json")
if script_tag:
job_json = json.loads(script_tag.string)
job_url_val = job_json["model"].get("saveJobURL", "")
m = re.search(r"job_url=(.+)", job_url_val)
if m:
job_url_direct = m.group(1)
try:
script_tag = soup.find("script", type="application/json")
if script_tag:
job_json = json.loads(script_tag.string)
job_url_val = job_json["model"].get("saveJobURL", "")
m = re.search(r"job_url=(.+)", job_url_val)
if m:
job_url_direct = m.group(1)
except:
job_url_direct = None
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description_full = markdown_converter(description_full)
@@ -215,33 +212,8 @@ class ZipRecruiterScraper(Scraper):
return description_full, job_url_direct
def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
"""
Sends a session event to the API with device properties.
"""
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data)
@staticmethod
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
@staticmethod
def _add_params(scraper_input) -> dict[str, str | Any]:
params = {
"search": scraper_input.search_term,
"location": scraper_input.location,
}
if scraper_input.hours_old:
params["days"] = max(scraper_input.hours_old // 24, 1)
job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
if scraper_input.job_type:
job_type = scraper_input.job_type
params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
if scraper_input.easy_apply:
params["zipapply"] = 1
if scraper_input.is_remote:
params["remote"] = 1
if scraper_input.distance:
params["radius"] = scraper_input.distance
return {k: v for k, v in params.items() if v is not None}
self.session.post(url, data=get_cookie_data)

View File

@@ -0,0 +1,29 @@
# Request headers for api.ziprecruiter.com that identify the client as
# the iOS "Job Search" app.
# NOTE(review): the authorization, device id and push-notification id are
# hard-coded; presumably shared app credentials rather than user
# secrets — confirm before rotating or publishing.
headers = {
    "Host": "api.ziprecruiter.com",
    "accept": "*/*",
    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
    "accept-language": "en-US,en;q=0.9",
}
# Form fields of the "session" event that the ZipRecruiter scraper posts
# to the API's /jobs-app/event endpoint. It is a list of pairs rather
# than a dict because the "property" field is repeated once per device
# attribute.
get_cookie_data = [
    ("event_type", "session"),
    ("logged_in", "false"),
    ("number_of_retry", "1"),
    ("property", "model:iPhone"),
    ("property", "os:iOS"),
    ("property", "locale:en_us"),
    ("property", "app_build_number:4734"),
    ("property", "app_version:91.0"),
    ("property", "manufacturer:Apple"),
    ("property", "timestamp:2025-01-12T12:04:42-06:00"),
    ("property", "screen_height:852"),
    ("property", "os_version:16.6.1"),
    ("property", "source:install"),
    ("property", "screen_width:393"),
    ("property", "device_model:iPhone 14 Pro"),
    ("property", "brand:Apple"),
]

View File

@@ -0,0 +1,31 @@
from jobspy.model import JobType
def add_params(scraper_input) -> dict[str, str | int]:
    """
    Build the query parameters for a search request from *scraper_input*.

    "search" and "location" always come from the input. "days",
    "employment_type", "zipapply", "remote" and "radius" are included only
    when the matching input field is truthy. Entries whose value is None are
    stripped from the returned mapping.
    """
    candidates: dict[str, str | int] = {
        "search": scraper_input.search_term,
        "location": scraper_input.location,
    }

    if hours := scraper_input.hours_old:
        # Whole days, with a floor of one day.
        candidates["days"] = max(hours // 24, 1)

    employment_names = {
        JobType.FULL_TIME: "full_time",
        JobType.PART_TIME: "part_time",
    }
    if chosen := scraper_input.job_type:
        # Unmapped job types use the first entry of the enum member's value.
        candidates["employment_type"] = employment_names.get(chosen, chosen.value[0])

    # On/off filters are encoded as 1 when set and left out otherwise.
    toggles = {
        "zipapply": scraper_input.easy_apply,
        "remote": scraper_input.is_remote,
    }
    candidates.update({name: 1 for name, switched_on in toggles.items() if switched_on})

    if radius := scraper_input.distance:
        candidates["radius"] = radius

    return {name: value for name, value in candidates.items() if value is not None}
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    """
    Return the first JobType whose value contains *job_type_str*.

    The match is wrapped in a one-element list; None is returned when no
    member matches. Members are examined in declaration order.
    """
    return next(
        ([member] for member in JobType if job_type_str in member.value),
        None,
    )

View File

@@ -1,2 +0,0 @@
[virtualenvs]
in-project = true

View File

@@ -1,15 +1,20 @@
[build-system]
requires = [ "poetry-core",]
build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
version = "1.1.71"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
version = "1.1.77"
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md"
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt"]
[[tool.poetry.packages]]
include = "jobspy"
packages = [
{ include = "jobspy", from = "src" }
]
[tool.black]
line-length = 88
[tool.poetry.dependencies]
python = "^3.10"
@@ -22,16 +27,7 @@ tls-client = "^1.0.1"
markdownify = "^0.13.1"
regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "*"
pre-commit = "*"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88

View File

@@ -1,51 +0,0 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from ..jobs import (
Enum,
BaseModel,
JobType,
JobResponse,
Country,
DescriptionFormat,
)
class Site(Enum):
    """Job sites a scraper can be bound to; each value is a lowercase string key."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
class SalarySource(Enum):
    """Tags for the origin of a salary value."""

    # NOTE(review): presumably salary taken from structured site data;
    # confirm against the code that sets it.
    DIRECT_DATA = "direct_data"
    # NOTE(review): presumably salary parsed out of the job description text;
    # confirm against the code that sets it.
    DESCRIPTION = "description"
class ScraperInput(BaseModel):
    """Search settings handed to `Scraper.scrape`; only `site_type` has no default."""

    # Sites to run the search against (required).
    site_type: list[Site]
    # Free-text query and place to search; both optional.
    search_term: str | None = None
    location: str | None = None

    # Defaults to Country.USA.
    country: Country | None = Country.USA
    # NOTE(review): unit of distance is not visible here; confirm with callers.
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    # NOTE(review): presumably the number of leading results to skip; confirm.
    offset: int = 0
    # NOTE(review): the two linkedin_* fields look LinkedIn-specific; confirm
    # other scrapers ignore them.
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    # Defaults to DescriptionFormat.MARKDOWN.
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    # Number of results requested; defaults to 15.
    results_wanted: int = 15
    # NOTE(review): presumably a maximum posting age in hours; confirm.
    hours_old: int | None = None
class Scraper(ABC):
    """
    Abstract base class for site scrapers.

    Stores the target site plus optional proxy list and CA certificate, and
    requires subclasses to provide `scrape`.
    """

    def __init__(
        self,
        site: Site,
        proxies: list[str] | None = None,
        ca_cert: str | None = None,
    ):
        """Record the site, proxies and CA certificate on the instance unchanged."""
        self.site, self.proxies, self.ca_cert = site, proxies, ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Run the search described by *scraper_input*; must be overridden."""

View File

@@ -1,10 +0,0 @@
# Header name/value pairs in the order they appear in the resulting mapping.
# NOTE(review): the device identifiers and the Basic credential are fixed
# literals; presumably captured from a real app session. Confirm that is
# intended.
_HEADER_PAIRS = (
    ("Host", "api.ziprecruiter.com"),
    ("accept", "*/*"),
    ("x-zr-zva-override", "100000000;vid:ZT1huzm_EQlDTVEc"),
    ("x-pushnotificationid", "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0"),
    ("x-deviceid", "D77B3A92-E589-46A4-8A39-6EF6F1D86006"),
    ("user-agent", "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)"),
    ("authorization", "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg=="),
    ("accept-language", "en-US,en;q=0.9"),
)

# Request headers for api.ziprecruiter.com, presenting an iPhone user agent.
headers = dict(_HEADER_PAIRS)

View File

View File

@@ -1,18 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_all():
    """
    Scrape several sites in one call and check the combined result size.

    Expects a DataFrame holding exactly `results_wanted` rows per requested
    site. Calls `scrape_jobs` for real, so it presumably needs network access.
    """
    per_site = 5
    sites = [
        "indeed",
        "glassdoor",
    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci

    result = scrape_jobs(
        site_name=sites,
        search_term="engineer",
        results_wanted=per_site,
    )

    # Checked separately so a failure reports which expectation broke.
    assert isinstance(
        result, pd.DataFrame
    ), f"Result should be a DataFrame, got {type(result).__name__}"
    expected_rows = len(sites) * per_site
    assert (
        len(result) == expected_rows
    ), f"Result should have {expected_rows} rows, got {len(result)}"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_glassdoor():
    """
    Scrape "engineer" jobs from Glassdoor and check the result size.

    Expects a DataFrame with exactly the requested number of rows. Calls
    `scrape_jobs` for real, so it presumably needs network access.
    """
    wanted = 5
    result = scrape_jobs(
        site_name="glassdoor",
        search_term="engineer",
        results_wanted=wanted,
    )

    # Checked separately so a failure reports which expectation broke.
    assert isinstance(
        result, pd.DataFrame
    ), f"Result should be a DataFrame, got {type(result).__name__}"
    assert (
        len(result) == wanted
    ), f"Result should have {wanted} rows, got {len(result)}"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_indeed():
    """
    Scrape "engineer" jobs from Indeed and check the result size.

    Expects a DataFrame with exactly the requested number of rows. Calls
    `scrape_jobs` for real, so it presumably needs network access.
    """
    wanted = 5
    result = scrape_jobs(
        site_name="indeed",
        search_term="engineer",
        results_wanted=wanted,
    )

    # Checked separately so a failure reports which expectation broke.
    assert isinstance(
        result, pd.DataFrame
    ), f"Result should be a DataFrame, got {type(result).__name__}"
    assert (
        len(result) == wanted
    ), f"Result should have {wanted} rows, got {len(result)}"

View File

@@ -1,9 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_linkedin():
    """
    Scrape "engineer" jobs from LinkedIn and check the result size.

    Expects a DataFrame with exactly the requested number of rows. Calls
    `scrape_jobs` for real, so it presumably needs network access.
    """
    wanted = 5
    result = scrape_jobs(
        site_name="linkedin",
        search_term="engineer",
        results_wanted=wanted,
    )

    # Checked separately so a failure reports which expectation broke.
    assert isinstance(
        result, pd.DataFrame
    ), f"Result should be a DataFrame, got {type(result).__name__}"
    assert (
        len(result) == wanted
    ), f"Result should have {wanted} rows, got {len(result)}"

View File

@@ -1,12 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_ziprecruiter():
    """
    Scrape "software engineer" jobs from ZipRecruiter and check the result size.

    Expects a DataFrame with exactly the requested number of rows. Calls
    `scrape_jobs` for real, so it presumably needs network access.
    """
    wanted = 5
    result = scrape_jobs(
        site_name="zip_recruiter",
        search_term="software engineer",
        results_wanted=wanted,
    )

    # Checked separately so a failure reports which expectation broke.
    assert isinstance(
        result, pd.DataFrame
    ), f"Result should be a DataFrame, got {type(result).__name__}"
    assert (
        len(result) == wanted
    ), f"Result should have {wanted} rows, got {len(result)}"