Compare commits

...

23 Commits

Author SHA1 Message Date
Cullen Watson
7cb0c518fc docs:readme 2025-02-21 12:53:59 -06:00
Cullen Watson
df70d4bc2e minor 2025-02-21 12:35:31 -06:00
Cullen Watson
3006063875 enh:remove log by default 2025-02-21 12:31:04 -06:00
Abdulrahman Hisham
1be009b8bc Adding Bayt.com Scraper to current codebase (#246) 2025-02-21 12:29:54 -06:00
Cullen Watson
81ed9b3ddf enh:remove log by default 2025-02-21 12:29:28 -06:00
Abdulrahman Al Muaitah
11a9e9a56a Fixed Bayt scraper integration 2025-02-21 20:10:02 +04:00
Abdulrahman Al Muaitah
c6ade14784 Added Bayt Scraper integration 2025-02-21 15:31:29 +04:00
Cullen Watson
13c74a0fed docs:readme 2025-02-09 13:42:18 -06:00
Cullen Watson
333e9e6760 docs:readme 2025-01-17 21:44:49 -06:00
github-actions
04032a0f91 Increment version 2024-12-04 22:55:06 +00:00
Cullen Watson
496896d0b5 enh:fix yml (#225) 2024-12-04 16:54:52 -06:00
Cullen Watson
87ba1ad1bf fix yml 2024-12-04 16:52:15 -06:00
Jason Geffner
4e7ac9a583 Fix Google job search (#223)
The previous regex did not capture all expected matches in the returned content
2024-12-04 16:45:59 -06:00
Cullen Watson
e44d13e1cf enh:auto update version 2024-12-04 16:29:38 -06:00
Cullen Watson
d52e366ef7 docs:readme 2024-11-26 15:51:26 -06:00
Cullen Watson
395ebf0017 docs:readme 2024-11-26 15:49:12 -06:00
Cullen Watson
63fddd9b7f docs:readme 2024-11-26 15:48:22 -06:00
Cullen Watson
58956868ae docs:readme 2024-11-26 15:47:10 -06:00
Cullen Watson
4fce836222 docs:readme 2024-10-28 03:53:59 -05:00
Cullen Watson
5ba25e7a7c docs:readme 2024-10-28 03:42:19 -05:00
Cullen Watson
f7cb3e9206 docs:readme 2024-10-28 03:36:21 -05:00
Cullen Watson
3ad3f121f7 docs:readme 2024-10-28 03:34:52 -05:00
Cullen Watson
ff3c782912 docs:readme 2024-10-25 18:12:08 -05:00
23 changed files with 364 additions and 245 deletions

View File

@@ -1,33 +1,50 @@
name: Publish Python 🐍 distributions 📦 to PyPI
on: push
on:
pull_request:
types:
- closed
permissions:
contents: write
jobs:
build-n-publish:
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: pip install toml
- name: Increment version
run: python increment_version.py
- name: Commit version increment
run: |
git config --global user.name 'github-actions'
git config --global user.email 'github-actions@github.com'
git add pyproject.toml
git commit -m 'Increment version'
- name: Push changes
run: git push
- name: Install poetry
run: >-
python3 -m
pip install
poetry
--user
run: pip install poetry --user
- name: Build distribution 📦
run: >-
python3 -m
poetry
build
run: poetry build
- name: Publish distribution 📦 to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
password: ${{ secrets.PYPI_API_TOKEN }}

View File

@@ -1,22 +0,0 @@
name: Python Tests
on:
pull_request:
branches:
- main
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
run: |
pip install poetry
poetry install
- name: Run tests
run: poetry run pytest tests/test_all.py

168
README.md
View File

@@ -1,15 +1,12 @@
<img src="https://github.com/cullenwatson/JobSpy/assets/78247585/ae185b7e-e444-4712-8bb9-fa97f53e896b" width="400">
**JobSpy** is a simple, yet comprehensive, job scraping library.
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
**JobSpy** is a job scraping library with the goal of aggregating all the jobs from popular job boards with one tool.
## Features
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxies support
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
- Aggregates the job postings in a dataframe
- Proxies support to bypass blocking
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
@@ -28,17 +25,16 @@ import csv
from jobspy import scrape_jobs
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
search_term="software engineer",
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor
hours_old=72,
country_indeed='USA',
# linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
@@ -62,13 +58,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext
Optional
├── site_name (list|str):
| linkedin, zip_recruiter, indeed, glassdoor, google
| linkedin, zip_recruiter, indeed, glassdoor, google, bayt
| (default is all)
├── search_term (str)
|
├── google_search_term (str)
| search term for google jobs. This is is only param for filtering google jobs.
| search term for google jobs. This is the only param for filtering google jobs.
├── location (str)
@@ -88,7 +84,7 @@ Optional
| number of job results to retrieve for each site specified in 'site_name'
├── easy_apply (bool):
| filters for jobs that are hosted on the job board site
| filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
├── description_format (str):
| markdown, html (Format type of the job descriptions. Default is markdown.)
@@ -133,6 +129,84 @@ Optional
| - easy_apply
```
## Supported Countries for Job Searching
### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter.
### **ZipRecruiter**
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
### **Indeed / Glassdoor**
Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
parameter to narrow down the location, e.g. city & state if necessary.
You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
| | | | |
|----------------------|--------------|------------|----------------|
| Argentina | Australia* | Austria* | Bahrain |
| Belgium* | Brazil* | Canada* | Chile |
| China | Colombia | Costa Rica | Czech Republic |
| Denmark | Ecuador | Egypt | Finland |
| France* | Germany* | Greece | Hong Kong* |
| Hungary | India* | Indonesia | Ireland* |
| Israel | Italy* | Japan | Kuwait |
| Luxembourg | Malaysia | Mexico* | Morocco |
| Netherlands* | New Zealand* | Nigeria | Norway |
| Oman | Pakistan | Panama | Peru |
| Philippines | Poland | Portugal | Qatar |
| Romania | Saudi Arabia | Singapore* | South Africa |
| South Korea | Spain* | Sweden | Switzerland* |
| Taiwan | Thailand | Turkey | Ukraine |
| United Arab Emirates | UK* | USA* | Uruguay |
| Venezuela | Vietnam* | | |
### **Bayt**
Bayt currently uses only the `search_term` parameter and searches internationally.
## Notes
* Indeed is the best scraper currently with no rate limiting.
* All the job board endpoints are capped at around 1000 jobs on a given search.
* LinkedIn is the most restrictive and usually rate-limits around the 10th page from a single IP; proxies are essentially required.
## Frequently Asked Questions
---
**Q: Why is Indeed giving unrelated roles?**
**A:** Indeed searches the description too.
- use - to remove words
- "" for exact match
Example of a good Indeed query
```py
search_term='"engineering intern" software summer (java OR python OR c++) 2025 -tax -marketing'
```
This searches the description/title and must include software, summer, 2025, one of the languages, engineering intern exactly, no tax, no marketing.
---
**Q: No results when using "google"?**
**A:** You have to use super specific syntax. Search for google jobs on your browser and then whatever pops up in the google jobs search box after applying some filters is what you need to copy & paste into the google_search_term.
---
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
- Wait some time between scrapes (site-dependent).
- Try using the proxies param to change your IP address.
---
### JobPost Schema
@@ -172,67 +246,3 @@ Indeed specific
├── company_description
└── company_logo
```
## Supported Countries for Job Searching
### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter.
### **ZipRecruiter**
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
### **Indeed / Glassdoor**
Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
parameter to narrow down the location, e.g. city & state if necessary.
You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
| | | | |
|----------------------|--------------|------------|----------------|
| Argentina | Australia* | Austria* | Bahrain |
| Belgium* | Brazil* | Canada* | Chile |
| China | Colombia | Costa Rica | Czech Republic |
| Denmark | Ecuador | Egypt | Finland |
| France* | Germany* | Greece | Hong Kong* |
| Hungary | India* | Indonesia | Ireland* |
| Israel | Italy* | Japan | Kuwait |
| Luxembourg | Malaysia | Mexico* | Morocco |
| Netherlands* | New Zealand* | Nigeria | Norway |
| Oman | Pakistan | Panama | Peru |
| Philippines | Poland | Portugal | Qatar |
| Romania | Saudi Arabia | Singapore* | South Africa |
| South Korea | Spain* | Sweden | Switzerland* |
| Taiwan | Thailand | Turkey | Ukraine |
| United Arab Emirates | UK* | USA* | Uruguay |
| Venezuela | Vietnam* | | |
## Notes
* Indeed is the best scraper currently with no rate limiting.
* All the job board endpoints are capped at around 1000 jobs on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page with one ip. Proxies are a must basically.
## Frequently Asked Questions
---
**Q: Why is Indeed giving unrelated roles?**
**A:** Indeed is searching each one of your terms e.g. software intern, it searches software OR intern. Try search_term='"software intern"' in quotes for stricter searching
---
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
- Wait some time between scrapes (site-dependent).
- Try using the proxies param to change your IP address.
---
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
---

21
increment_version.py Normal file
View File

@@ -0,0 +1,21 @@
import toml
def increment_version(version):
    """Return *version* with its patch component bumped by one.

    Expects a "major.minor.patch" string of integers, e.g. "1.1.75" -> "1.1.76".
    Raises ValueError if any component is not an integer.
    """
    major, minor, patch = (int(part) for part in version.split("."))
    return ".".join(str(component) for component in (major, minor, patch + 1))
# Load pyproject.toml (run from the repository root; used by the CI
# "Increment version" workflow step).
with open('pyproject.toml', 'r') as file:
    pyproject = toml.load(file)

# Increment the patch component of [tool.poetry].version in place.
current_version = pyproject['tool']['poetry']['version']
new_version = increment_version(current_version)
pyproject['tool']['poetry']['version'] = new_version

# Save the updated pyproject.toml. NOTE(review): toml.dump rewrites the
# whole file, so unrelated formatting/comments in pyproject.toml are not
# preserved — confirm that is acceptable.
with open('pyproject.toml', 'w') as file:
    toml.dump(pyproject, file)

print(f"Version updated from {current_version} to {new_version}")

View File

@@ -1,2 +0,0 @@
[virtualenvs]
in-project = true

View File

@@ -1,15 +1,21 @@
[build-system]
requires = [ "poetry-core",]
build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
version = "1.1.75"
version = "1.1.76"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md"
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
[[tool.poetry.packages]]
include = "jobspy"
from = "src"
packages = [
{ include = "jobspy", from = "src" }
]
[tool.black]
line-length = 88
[tool.poetry.dependencies]
python = "^3.10"
@@ -22,16 +28,8 @@ tls-client = "^1.0.1"
markdownify = "^0.13.1"
regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "*"
pre-commit = "*"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88

View File

@@ -11,6 +11,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers.bayt import BaytScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
@@ -41,7 +42,7 @@ def scrape_jobs(
offset: int | None = 0,
hours_old: int = None,
enforce_annual_salary: bool = False,
verbose: int = 2,
verbose: int = 0,
**kwargs,
) -> pd.DataFrame:
"""
@@ -54,6 +55,7 @@ def scrape_jobs(
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.GOOGLE: GoogleJobsScraper,
Site.BAYT: BaytScraper,
}
set_logger_level(verbose)

View File

@@ -18,6 +18,7 @@ class Site(Enum):
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
BAYT = "bayt"
class SalarySource(Enum):

View File

@@ -0,0 +1,145 @@
"""
jobspy.scrapers.bayt
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Bayt.
"""
from __future__ import annotations
import random
import time
from bs4 import BeautifulSoup
from .. import Scraper, ScraperInput, Site
from ..utils import create_logger, create_session
from ...jobs import JobPost, JobResponse, Location, Country
log = create_logger("Bayt")
class BaytScraper(Scraper):
    """Scraper for Bayt.com job listings.

    Paginates through Bayt's international search results and parses each
    listing card with BeautifulSoup. Only the search term is used for
    filtering; results are always fetched from the international index.
    """

    base_url = "https://www.bayt.com"
    # Base sleep between page fetches, plus a random jitter of up to
    # `band_delay` extra seconds, to avoid hammering the site.
    delay = 2
    band_delay = 3

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
        self.scraper_input = None  # set at the start of scrape()
        self.session = None  # HTTP session, created in scrape()
        self.country = "worldwide"  # Bayt results are not scoped to a country

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch up to ``results_wanted`` jobs for the given search term.

        Pages through results until enough jobs are collected, a page yields
        no new listings, or a fetch fails; returns whatever was gathered.
        """
        self.scraper_input = scraper_input
        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        job_list: list[JobPost] = []
        page = 1
        # Default to 10 results when the caller did not specify a count.
        results_wanted = (
            scraper_input.results_wanted if scraper_input.results_wanted else 10
        )

        while len(job_list) < results_wanted:
            log.info(f"Fetching Bayt jobs page {page}")
            job_elements = self._fetch_jobs(self.scraper_input.search_term, page)
            if not job_elements:
                break

            if job_elements:
                log.debug(
                    "First job element snippet:\n" + job_elements[0].prettify()[:500]
                )

            # Remember how many jobs we had before this page so we can stop
            # paginating when a page contributes nothing new.
            initial_count = len(job_list)
            for job in job_elements:
                try:
                    job_post = self._extract_job_info(job)
                    if job_post:
                        job_list.append(job_post)
                        if len(job_list) >= results_wanted:
                            break
                    else:
                        log.debug(
                            "Extraction returned None. Job snippet:\n"
                            + job.prettify()[:500]
                        )
                except Exception as e:
                    # Skip listings that fail to parse rather than aborting
                    # the whole scrape.
                    log.error(f"Bayt: Error extracting job info: {str(e)}")
                    continue

            if len(job_list) == initial_count:
                log.info(f"No new jobs found on page {page}. Ending pagination.")
                break
            page += 1
            # Randomized politeness delay between page requests.
            time.sleep(random.uniform(self.delay, self.delay + self.band_delay))

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    def _fetch_jobs(self, query: str, page: int) -> list | None:
        """
        Grabs the job results for the given query and page number.

        Returns a list of BeautifulSoup listing elements, or None when the
        request or parsing fails.
        """
        try:
            url = f"{self.base_url}/en/international/jobs/{query}-jobs/?page={page}"
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            # Each listing card is an <li> carrying a data-js-job attribute.
            job_listings = soup.find_all("li", attrs={"data-js-job": ""})
            log.debug(f"Found {len(job_listings)} job listing elements")
            return job_listings
        except Exception as e:
            log.error(f"Bayt: Error fetching jobs - {str(e)}")
            return None

    def _extract_job_info(self, job: BeautifulSoup) -> JobPost | None:
        """
        Extracts the job information from a single job listing.

        Returns None when the title or the job URL cannot be found.
        """
        # Find the h2 element holding the title and link (no class filtering)
        job_general_information = job.find("h2")
        if not job_general_information:
            return
        job_title = job_general_information.get_text(strip=True)
        job_url = self._extract_job_url(job_general_information)
        if not job_url:
            return

        # Extract company name using the original approach:
        company_tag = job.find("div", class_="t-nowrap p10l")
        company_name = (
            company_tag.find("span").get_text(strip=True)
            if company_tag and company_tag.find("span")
            else None
        )

        # Extract location using the original approach:
        location_tag = job.find("div", class_="t-mute t-small")
        location = location_tag.get_text(strip=True) if location_tag else None

        # NOTE(review): hash() is salted per process in Python 3, so this id
        # is not stable across runs — confirm downstream dedup does not rely
        # on it.
        job_id = f"bayt-{abs(hash(job_url))}"
        location_obj = Location(
            city=location,
            country=Country.from_string(self.country),
        )
        return JobPost(
            id=job_id,
            title=job_title,
            company_name=company_name,
            location=location_obj,
            job_url=job_url,
        )

    def _extract_job_url(self, job_general_information: BeautifulSoup) -> str | None:
        """
        Pulls the job URL from the 'a' within the h2 element.

        Returns an absolute URL, or None when no link with an href is present.
        """
        a_tag = job_general_information.find("a")
        if a_tag and a_tag.has_attr("href"):
            return self.base_url + a_tag["href"].strip()

View File

@@ -29,3 +29,8 @@ class GlassdoorException(Exception):
class GoogleJobsException(Exception):
    """Raised when the Google Jobs scraper encounters an error."""

    def __init__(self, message=None):
        # Fall back to a generic message when none (or an empty one) is given.
        default = "An error occurred with Google Jobs"
        super().__init__(message if message else default)
class BaytException(Exception):
    """Raised when the Bayt scraper encounters an error."""

    def __init__(self, message=None):
        # Fall back to a generic message when none (or an empty one) is given.
        default = "An error occurred with Bayt"
        super().__init__(message if message else default)

View File

@@ -32,7 +32,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("Glassdoor")
log = create_logger("Glassdoor")
class GlassdoorScraper(Scraper):
@@ -64,7 +64,7 @@ class GlassdoorScraper(Scraper):
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
)
token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token
@@ -74,7 +74,7 @@ class GlassdoorScraper(Scraper):
scraper_input.location, scraper_input.is_remote
)
if location_type is None:
logger.error("Glassdoor: location not parsed")
log.error("Glassdoor: location not parsed")
return JobResponse(jobs=[])
job_list: list[JobPost] = []
cursor = None
@@ -83,7 +83,7 @@ class GlassdoorScraper(Scraper):
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"search page: {page} / {range_end-1}")
log.info(f"search page: {page} / {range_end - 1}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
@@ -93,7 +93,7 @@ class GlassdoorScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=job_list)
@@ -129,7 +129,7 @@ class GlassdoorScraper(Scraper):
ValueError,
Exception,
) as e:
logger.error(f"Glassdoor: {str(e)}")
log.error(f"Glassdoor: {str(e)}")
return jobs, None
jobs_data = res_json["data"]["jobListings"]["jobListings"]
@@ -264,12 +264,12 @@ class GlassdoorScraper(Scraper):
if res.status_code != 200:
if res.status_code == 429:
err = f"429 Response - Blocked by Glassdoor for too many requests"
logger.error(err)
log.error(err)
return None, None
else:
err = f"Glassdoor response status code {res.status_code}"
err += f" - {res.text}"
logger.error(f"Glassdoor response status code {res.status_code}")
log.error(f"Glassdoor response status code {res.status_code}")
return None, None
items = res.json()

View File

@@ -26,7 +26,7 @@ from ...jobs import (
JobType,
)
logger = create_logger("Google")
log = create_logger("Google")
class GoogleJobsScraper(Scraper):
@@ -61,7 +61,7 @@ class GoogleJobsScraper(Scraper):
)
forward_cursor, job_list = self._get_initial_cursor_and_jobs()
if forward_cursor is None:
logger.warning(
log.warning(
"initial cursor not found, try changing your query or there was at most 10 results"
)
return JobResponse(jobs=job_list)
@@ -72,16 +72,16 @@ class GoogleJobsScraper(Scraper):
len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
and forward_cursor
):
logger.info(
log.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
try:
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
except Exception as e:
logger.error(f"failed to get jobs on page: {page}, {e}")
log.error(f"failed to get jobs on page: {page}, {e}")
break
if not jobs:
logger.info(f"found no jobs on page: {page}")
log.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
@@ -230,10 +230,7 @@ class GoogleJobsScraper(Scraper):
@staticmethod
def _find_job_info_initial_page(html_text: str):
pattern = (
f'520084652":('
+ r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
)
pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
results = []
matches = re.finditer(pattern, html_text)
@@ -245,6 +242,6 @@ class GoogleJobsScraper(Scraper):
results.append(parsed_data)
except json.JSONDecodeError as e:
logger.error(f"Failed to parse match: {str(e)}")
log.error(f"Failed to parse match: {str(e)}")
results.append({"raw_match": match.group(0), "error": str(e)})
return results

View File

@@ -30,7 +30,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("Indeed")
log = create_logger("Indeed")
class IndeedScraper(Scraper):
@@ -71,12 +71,12 @@ class IndeedScraper(Scraper):
cursor = None
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
logger.info(
log.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, cursor = self._scrape_page(cursor)
if not jobs:
logger.info(f"found no jobs on page: {page}")
log.info(f"found no jobs on page: {page}")
break
job_list += jobs
page += 1
@@ -122,9 +122,10 @@ class IndeedScraper(Scraper):
headers=api_headers_temp,
json=payload,
timeout=10,
verify=False,
)
if not response.ok:
logger.info(
log.info(
f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
)
return jobs, new_cursor

View File

@@ -38,7 +38,7 @@ from ..utils import (
markdown_converter,
)
logger = create_logger("LinkedIn")
log = create_logger("LinkedIn")
class LinkedInScraper(Scraper):
@@ -86,7 +86,7 @@ class LinkedInScraper(Scraper):
)
while continue_search():
request_count += 1
logger.info(
log.info(
f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = {
@@ -126,13 +126,13 @@ class LinkedInScraper(Scraper):
else:
err = f"LinkedIn response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
log.error(err)
return JobResponse(jobs=job_list)
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"LinkedIn: Bad proxy")
log.error(f"LinkedIn: Bad proxy")
else:
logger.error(f"LinkedIn: {str(e)}")
log.error(f"LinkedIn: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")

View File

@@ -1,17 +1,20 @@
from __future__ import annotations
import re
import logging
import re
from itertools import cycle
import numpy as np
import requests
import tls_client
import numpy as np
import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry
from ..jobs import CompensationInterval, JobType
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def create_logger(name: str):
logger = logging.getLogger(f"JobSpy:{name}")
@@ -129,7 +132,7 @@ def create_session(
return session
def set_logger_level(verbose: int = 2):
def set_logger_level(verbose: int):
"""
Adjusts the logger's level. This function allows the logging level to be changed at runtime.

View File

@@ -11,11 +11,10 @@ import json
import math
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Optional, Tuple, Any
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from .constants import headers
@@ -37,7 +36,7 @@ from ...jobs import (
DescriptionFormat,
)
logger = create_logger("ZipRecruiter")
log = create_logger("ZipRecruiter")
class ZipRecruiterScraper(Scraper):
@@ -77,7 +76,7 @@ class ZipRecruiterScraper(Scraper):
break
if page > 1:
time.sleep(self.delay)
logger.info(f"search page: {page} / {max_pages}")
log.info(f"search page: {page} / {max_pages}")
jobs_on_page, continue_token = self._find_jobs_in_page(
scraper_input, continue_token
)
@@ -110,13 +109,13 @@ class ZipRecruiterScraper(Scraper):
else:
err = f"ZipRecruiter response status code {res.status_code}"
err += f" with response: {res.text}" # ZipRecruiter likely not available in EU
logger.error(err)
log.error(err)
return jobs_list, ""
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Indeed: Bad proxy")
log.error(f"Indeed: Bad proxy")
else:
logger.error(f"Indeed: {str(e)}")
log.error(f"Indeed: {str(e)}")
return jobs_list, ""
res_data = res.json()
@@ -215,7 +214,28 @@ class ZipRecruiterScraper(Scraper):
return description_full, job_url_direct
def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
"""
Sends a session event to the API with device properties.
"""
data = [
("event_type", "session"),
("logged_in", "false"),
("number_of_retry", "1"),
("property", "model:iPhone"),
("property", "os:iOS"),
("property", "locale:en_us"),
("property", "app_build_number:4734"),
("property", "app_version:91.0"),
("property", "manufacturer:Apple"),
("property", "timestamp:2025-01-12T12:04:42-06:00"),
("property", "screen_height:852"),
("property", "os_version:16.6.1"),
("property", "source:install"),
("property", "screen_width:393"),
("property", "device_model:iPhone 14 Pro"),
("property", "brand:Apple"),
]
url = f"{self.api_url}/jobs-app/event"
self.session.post(url, data=data)

View File

View File

@@ -1,18 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_all():
sites = [
"indeed",
"glassdoor",
] # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
result = scrape_jobs(
site_name=sites,
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == len(sites) * 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_glassdoor():
result = scrape_jobs(
site_name="glassdoor",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_google():
result = scrape_jobs(
site_name="google", search_term="software engineer", results_wanted=5
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,13 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_indeed():
result = scrape_jobs(
site_name="indeed",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,9 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_linkedin():
result = scrape_jobs(site_name="linkedin", search_term="engineer", results_wanted=5)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"

View File

@@ -1,12 +0,0 @@
from jobspy import scrape_jobs
import pandas as pd
def test_ziprecruiter():
result = scrape_jobs(
site_name="zip_recruiter", search_term="software engineer", results_wanted=5
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"