mirror of https://github.com/Bunsly/JobSpy
Compare commits
13 Commits

Author | SHA1 | Date
---|---|---
| 94d413bad1 |
| 61205bcc77 |
| f1602eca70 |
| d4d52d05f5 |
| 0946cb3373 |
| 051981689f |
| 903b7e6f1b |
| 6782b9884e |
| 94c74d60f2 |
| 5463e5a664 |
| ed139e7e6b |
| 5bd199d0a5 |
| 4ec308a302 |
@@ -1,50 +1,37 @@
name: Publish Python 🐍 distributions 📦 to PyPI
name: Publish JobSpy to PyPi
on:
pull_request:
types:
- closed

permissions:
contents: write
push:
branches:
- main
workflow_dispatch:

jobs:
build-n-publish:
name: Build and publish Python 🐍 distributions 📦 to PyPI
name: Build and publish JobSpy to PyPi
runs-on: ubuntu-latest

if: github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main'

steps:
- uses: actions/checkout@v3

- name: Set up Python
  uses: actions/setup-python@v4
  with:
    python-version: "3.10"

- name: Install dependencies
  run: pip install toml

- name: Increment version
  run: python increment_version.py

- name: Commit version increment
  run: |
    git config --global user.name 'github-actions'
    git config --global user.email 'github-actions@github.com'
    git add pyproject.toml
    git commit -m 'Increment version'

- name: Push changes
  run: git push

- name: Install poetry
  run: pip install poetry --user
  run: >-
    python3 -m
    pip install
    poetry
    --user

- name: Build distribution 📦
  run: poetry build
  run: >-
    python3 -m
    poetry
    build

- name: Publish distribution 📦 to PyPI
  if: startsWith(github.ref, 'refs/tags') || github.event_name == 'workflow_dispatch'
  uses: pypa/gh-action-pypi-publish@release/v1
  with:
    password: ${{ secrets.PYPI_API_TOKEN }}
    password: ${{ secrets.PYPI_API_TOKEN }}
README.md (17 changed lines)

@@ -4,7 +4,7 @@

## Features

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, & **Bayt** concurrently
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, **ZipRecruiter**, **Bayt** & **Naukri** concurrently
- Aggregates the job postings in a dataframe
- Proxies support to bypass blocking

@@ -25,7 +25,7 @@ import csv
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt"],
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
    search_term="software engineer",
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    location="San Francisco, CA",

@@ -51,6 +51,7 @@ linkedin Software Engineer - Early Career Lockheed Martin Sunnyvale
linkedin Full-Stack Software Engineer Rain New York NY fulltime yearly None None https://www.linkedin.com/jobs/view/3696158877 Rain’s mission is to create the fastest and ea...
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...

```

### Parameters for `scrape_jobs()`

@@ -220,6 +221,7 @@ JobPost
│ ├── country
│ ├── city
│ ├── state
├── is_remote
├── description
├── job_type: fulltime, parttime, internship, contract
├── job_function

@@ -229,8 +231,7 @@ JobPost
│ ├── currency
│ └── salary_source: direct_data, description (parsed from posting)
├── date_posted
├── emails
└── is_remote
└── emails

Linkedin specific
└── job_level

@@ -245,4 +246,12 @@ Indeed specific
├── company_revenue_label
├── company_description
└── company_logo

Naukri specific
├── skills
├── experience_range
├── company_rating
├── company_reviews_count
├── vacancy_count
└── work_from_home_type
```
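A minimal usage sketch for the Naukri support documented above; the search term and location values are illustrative, and it assumes the Naukri-specific columns listed in the schema are present in the returned DataFrame (empty when a board does not supply them):

```python
from jobspy import scrape_jobs

# Scrape only Naukri; parameters follow the README example above.
jobs = scrape_jobs(
    site_name=["naukri"],
    search_term="software engineer",
    location="Bangalore",
    results_wanted=10,
)

# The Naukri-specific fields documented above sit alongside the common columns.
naukri_cols = [
    "skills", "experience_range", "company_rating",
    "company_reviews_count", "vacancy_count", "work_from_home_type",
]
print(jobs[["title", "company"] + naukri_cols].head())
```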
@@ -1,21 +0,0 @@
import toml

def increment_version(version):
    major, minor, patch = map(int, version.split('.'))
    patch += 1
    return f"{major}.{minor}.{patch}"

# Load pyproject.toml
with open('pyproject.toml', 'r') as file:
    pyproject = toml.load(file)

# Increment the version
current_version = pyproject['tool']['poetry']['version']
new_version = increment_version(current_version)
pyproject['tool']['poetry']['version'] = new_version

# Save the updated pyproject.toml
with open('pyproject.toml', 'w') as file:
    toml.dump(pyproject, file)

print(f"Version updated from {current_version} to {new_version}")
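The removed increment_version.py bumped only the patch component of the Poetry version string. A standalone sketch of that logic, runnable without a pyproject.toml:

```python
def increment_version(version: str) -> str:
    # Split "major.minor.patch" and bump only the patch component.
    major, minor, patch = map(int, version.split("."))
    return f"{major}.{minor}.{patch + 1}"

assert increment_version("1.1.15") == "1.1.16"
assert increment_version("2.0.9") == "2.0.10"
```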
@@ -1,25 +1,28 @@
from __future__ import annotations

import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from .jobs import JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers.bayt import BaytScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
    GlassdoorException,
    GoogleJobsException,
import pandas as pd

from jobspy.bayt import BaytScraper
from jobspy.glassdoor import Glassdoor
from jobspy.google import Google
from jobspy.indeed import Indeed
from jobspy.linkedin import LinkedIn
from jobspy.naukri import Naukri
from jobspy.model import JobType, Location, JobResponse, Country
from jobspy.model import SalarySource, ScraperInput, Site
from jobspy.util import (
    set_logger_level,
    extract_salary,
    create_logger,
    get_enum_from_value,
    map_str_to_site,
    convert_to_annual,
    desired_order,
)
from jobspy.ziprecruiter import ZipRecruiter


def scrape_jobs(

@@ -33,7 +36,6 @@ def scrape_jobs(
    easy_apply: bool | None = None,
    results_wanted: int = 15,
    country_indeed: str = "usa",
    hyperlinks: bool = False,
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",

@@ -46,28 +48,19 @@ def scrape_jobs(
    **kwargs,
) -> pd.DataFrame:
    """
    Simultaneously scrapes job data from multiple job sites.
    :return: pandas dataframe containing job data
    Scrapes job data from job boards concurrently
    :return: Pandas DataFrame containing job data
    """
    SCRAPER_MAPPING = {
        Site.LINKEDIN: LinkedInScraper,
        Site.INDEED: IndeedScraper,
        Site.ZIP_RECRUITER: ZipRecruiterScraper,
        Site.GLASSDOOR: GlassdoorScraper,
        Site.GOOGLE: GoogleJobsScraper,
        Site.LINKEDIN: LinkedIn,
        Site.INDEED: Indeed,
        Site.ZIP_RECRUITER: ZipRecruiter,
        Site.GLASSDOOR: Glassdoor,
        Site.GOOGLE: Google,
        Site.BAYT: BaytScraper,
        Site.NAUKRI: Naukri,
    }
    set_logger_level(verbose)

    def map_str_to_site(site_name: str) -> Site:
        return Site[site_name.upper()]

    def get_enum_from_value(value_str):
        for job_type in JobType:
            if value_str in job_type.value:
                return job_type
        raise Exception(f"Invalid job type: {value_str}")

    job_type = get_enum_from_value(job_type) if job_type else None

    def get_site_type():

@@ -127,28 +120,12 @@ def scrape_jobs(
            site_value, scraped_data = future.result()
            site_to_jobs_dict[site_value] = scraped_data

    def convert_to_annual(job_data: dict):
        if job_data["interval"] == "hourly":
            job_data["min_amount"] *= 2080
            job_data["max_amount"] *= 2080
        if job_data["interval"] == "monthly":
            job_data["min_amount"] *= 12
            job_data["max_amount"] *= 12
        if job_data["interval"] == "weekly":
            job_data["min_amount"] *= 52
            job_data["max_amount"] *= 52
        if job_data["interval"] == "daily":
            job_data["min_amount"] *= 260
            job_data["max_amount"] *= 260
        job_data["interval"] = "yearly"

    jobs_dfs: list[pd.DataFrame] = []

    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_url = job_data["job_url"]
            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (

@@ -164,6 +141,7 @@ def scrape_jobs(
                **job_data["location"]
            ).display_location()

            # Handle compensation
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (

@@ -182,7 +160,6 @@ def scrape_jobs(
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)

            else:
                if country_enum == Country.USA:
                    (

@@ -201,6 +178,17 @@ def scrape_jobs(
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )

            #naukri-specific fields
            job_data["skills"] = (
                ", ".join(job_data["skills"]) if job_data["skills"] else None
            )
            job_data["experience_range"] = job_data.get("experience_range")
            job_data["company_rating"] = job_data.get("company_rating")
            job_data["company_reviews_count"] = job_data.get("company_reviews_count")
            job_data["vacancy_count"] = job_data.get("vacancy_count")
            job_data["work_from_home_type"] = job_data.get("work_from_home_type")

            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)

@@ -211,38 +199,6 @@ def scrape_jobs(
        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)

        # Desired column order
        desired_order = [
            "id",
            "site",
            "job_url_hyper" if hyperlinks else "job_url",
            "job_url_direct",
            "title",
            "company",
            "location",
            "date_posted",
            "job_type",
            "salary_source",
            "interval",
            "min_amount",
            "max_amount",
            "currency",
            "is_remote",
            "job_level",
            "job_function",
            "listing_type",
            "emails",
            "description",
            "company_industry",
            "company_url",
            "company_logo",
            "company_url_direct",
            "company_addresses",
            "company_num_employees",
            "company_revenue",
            "company_description",
        ]

        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:

@@ -256,4 +212,4 @@ def scrape_jobs(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()
    return pd.DataFrame()
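The convert_to_annual helper above normalizes hourly, weekly, monthly, and daily pay to yearly amounts (2080 work hours, 52 weeks, 12 months, 260 workdays per year). A standalone sketch of the same arithmetic, written as a dict of multipliers rather than the chained ifs above:

```python
def convert_to_annual(job_data: dict) -> None:
    # Mirrors the multipliers used in scrape_jobs: 2080 hours, 52 weeks,
    # 12 months, 260 workdays per year.
    multipliers = {"hourly": 2080, "weekly": 52, "monthly": 12, "daily": 260}
    factor = multipliers.get(job_data["interval"])
    if factor:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"

job = {"interval": "hourly", "min_amount": 65, "max_amount": 75}
convert_to_annual(job)
assert job == {"interval": "yearly", "min_amount": 135200, "max_amount": 156000}
```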
@@ -1,10 +1,3 @@
"""
jobspy.scrapers.bayt
~~~~~~~~~~~~~~~~~~~

This module contains routines to scrape Bayt.
"""

from __future__ import annotations

import random

@@ -12,9 +5,16 @@ import time

from bs4 import BeautifulSoup

from .. import Scraper, ScraperInput, Site
from ..utils import create_logger, create_session
from ...jobs import JobPost, JobResponse, Location, Country
from jobspy.model import (
    Scraper,
    ScraperInput,
    Site,
    JobPost,
    JobResponse,
    Location,
    Country,
)
from jobspy.util import create_logger, create_session

log = create_logger("Bayt")
@@ -1,5 +1,5 @@
"""
jobspy.scrapers.exceptions
jobspy.jobboard.exceptions
~~~~~~~~~~~~~~~~~~~

This module contains the set of Scrapers' exceptions.

@@ -34,3 +34,7 @@ class GoogleJobsException(Exception):
class BaytException(Exception):
    def __init__(self, message=None):
        super().__init__(message or "An error occurred with Bayt")


class NaukriException(Exception):
    def __init__(self,message=None):
        super().__init__(message or "An error occurred with Naukri")
@ -1,41 +1,38 @@
|
|||
"""
|
||||
jobspy.scrapers.glassdoor
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains routines to scrape Glassdoor.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import json
|
||||
import requests
|
||||
from typing import Optional, Tuple
|
||||
from typing import Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from .constants import fallback_token, query_template, headers
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import extract_emails_from_text, create_logger
|
||||
from ..exceptions import GlassdoorException
|
||||
from ..utils import (
|
||||
from jobspy.glassdoor.constant import fallback_token, query_template, headers
|
||||
from jobspy.glassdoor.util import (
|
||||
get_cursor_for_page,
|
||||
parse_compensation,
|
||||
parse_location,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
create_logger,
|
||||
create_session,
|
||||
markdown_converter,
|
||||
)
|
||||
from ...jobs import (
|
||||
from jobspy.exception import GlassdoorException
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
CompensationInterval,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
DescriptionFormat,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
)
|
||||
|
||||
log = create_logger("Glassdoor")
|
||||
|
||||
|
||||
class GlassdoorScraper(Scraper):
|
||||
class Glassdoor(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
):
|
||||
|
@ -146,7 +143,7 @@ class GlassdoorScraper(Scraper):
|
|||
except Exception as exc:
|
||||
raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
|
||||
|
||||
return jobs, self.get_cursor_for_page(
|
||||
return jobs, get_cursor_for_page(
|
||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||
)
|
||||
|
||||
|
@ -185,9 +182,9 @@ class GlassdoorScraper(Scraper):
|
|||
if location_type == "S":
|
||||
is_remote = True
|
||||
else:
|
||||
location = self.parse_location(location_name)
|
||||
location = parse_location(location_name)
|
||||
|
||||
compensation = self.parse_compensation(job["header"])
|
||||
compensation = parse_compensation(job["header"])
|
||||
try:
|
||||
description = self._fetch_job_description(job_id)
|
||||
except:
|
||||
|
@ -321,44 +318,3 @@ class GlassdoorScraper(Scraper):
|
|||
{"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
|
||||
)
|
||||
return json.dumps([payload])
|
||||
|
||||
@staticmethod
|
||||
def parse_compensation(data: dict) -> Optional[Compensation]:
|
||||
pay_period = data.get("payPeriod")
|
||||
adjusted_pay = data.get("payPeriodAdjustedPay")
|
||||
currency = data.get("payCurrency", "USD")
|
||||
if not pay_period or not adjusted_pay:
|
||||
return None
|
||||
|
||||
interval = None
|
||||
if pay_period == "ANNUAL":
|
||||
interval = CompensationInterval.YEARLY
|
||||
elif pay_period:
|
||||
interval = CompensationInterval.get_interval(pay_period)
|
||||
min_amount = int(adjusted_pay.get("p10") // 1)
|
||||
max_amount = int(adjusted_pay.get("p90") // 1)
|
||||
return Compensation(
|
||||
interval=interval,
|
||||
min_amount=min_amount,
|
||||
max_amount=max_amount,
|
||||
currency=currency,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
|
||||
@staticmethod
|
||||
def parse_location(location_name: str) -> Location | None:
|
||||
if not location_name or location_name == "Remote":
|
||||
return
|
||||
city, _, state = location_name.partition(", ")
|
||||
return Location(city=city, state=state)
|
||||
|
||||
@staticmethod
|
||||
def get_cursor_for_page(pagination_cursors, page_num):
|
||||
for cursor_data in pagination_cursors:
|
||||
if cursor_data["pageNumber"] == page_num:
|
||||
return cursor_data["cursor"]
|
|
@@ -0,0 +1,42 @@
from jobspy.model import Compensation, CompensationInterval, Location, JobType


def parse_compensation(data: dict) -> Compensation | None:
    pay_period = data.get("payPeriod")
    adjusted_pay = data.get("payPeriodAdjustedPay")
    currency = data.get("payCurrency", "USD")
    if not pay_period or not adjusted_pay:
        return None

    interval = None
    if pay_period == "ANNUAL":
        interval = CompensationInterval.YEARLY
    elif pay_period:
        interval = CompensationInterval.get_interval(pay_period)
    min_amount = int(adjusted_pay.get("p10") // 1)
    max_amount = int(adjusted_pay.get("p90") // 1)
    return Compensation(
        interval=interval,
        min_amount=min_amount,
        max_amount=max_amount,
        currency=currency,
    )


def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]


def parse_location(location_name: str) -> Location | None:
    if not location_name or location_name == "Remote":
        return
    city, _, state = location_name.partition(", ")
    return Location(city=city, state=state)


def get_cursor_for_page(pagination_cursors, page_num):
    for cursor_data in pagination_cursors:
        if cursor_data["pageNumber"] == page_num:
            return cursor_data["cursor"]
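A small usage sketch for the Glassdoor helpers above; the cursor values are illustrative, parse_location splits on the first ", ", and remote listings yield no Location:

```python
from jobspy.glassdoor.util import parse_location, get_cursor_for_page

loc = parse_location("San Francisco, CA")
assert loc.city == "San Francisco" and loc.state == "CA"
assert parse_location("Remote") is None

# Pagination cursors have the shape read by get_cursor_for_page above.
cursors = [{"pageNumber": 2, "cursor": "abc123"}, {"pageNumber": 3, "cursor": "def456"}]
assert get_cursor_for_page(cursors, 3) == "def456"
```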
@ -1,10 +1,3 @@
|
|||
"""
|
||||
jobspy.scrapers.google
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains routines to scrape Google.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
@ -13,23 +6,21 @@ import json
|
|||
from typing import Tuple
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .constants import headers_jobs, headers_initial, async_param
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import extract_emails_from_text, create_logger, extract_job_type
|
||||
from ..utils import (
|
||||
create_session,
|
||||
)
|
||||
from ...jobs import (
|
||||
from jobspy.google.constant import headers_jobs, headers_initial, async_param
|
||||
from jobspy.model import (
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
JobPost,
|
||||
JobResponse,
|
||||
Location,
|
||||
JobType,
|
||||
)
|
||||
|
||||
log = create_logger("Google")
|
||||
from jobspy.util import extract_emails_from_text, extract_job_type, create_session
|
||||
from jobspy.google.util import log, find_job_info_initial_page, find_job_info
|
||||
|
||||
|
||||
class GoogleJobsScraper(Scraper):
|
||||
class Google(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
):
|
||||
|
@ -135,7 +126,7 @@ class GoogleJobsScraper(Scraper):
|
|||
pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
|
||||
match_fc = re.search(pattern_fc, response.text)
|
||||
data_async_fc = match_fc.group(1) if match_fc else None
|
||||
jobs_raw = self._find_job_info_initial_page(response.text)
|
||||
jobs_raw = find_job_info_initial_page(response.text)
|
||||
jobs = []
|
||||
for job_raw in jobs_raw:
|
||||
job_post = self._parse_job(job_raw)
|
||||
|
@ -167,7 +158,7 @@ class GoogleJobsScraper(Scraper):
|
|||
continue
|
||||
job_d = json.loads(job_data)
|
||||
|
||||
job_info = self._find_job_info(job_d)
|
||||
job_info = find_job_info(job_d)
|
||||
job_post = self._parse_job(job_info)
|
||||
if job_post:
|
||||
jobs_on_page.append(job_post)
|
||||
|
@ -209,39 +200,3 @@ class GoogleJobsScraper(Scraper):
|
|||
job_type=extract_job_type(description),
|
||||
)
|
||||
return job_post
|
||||
|
||||
@staticmethod
|
||||
def _find_job_info(jobs_data: list | dict) -> list | None:
|
||||
"""Iterates through the JSON data to find the job listings"""
|
||||
if isinstance(jobs_data, dict):
|
||||
for key, value in jobs_data.items():
|
||||
if key == "520084652" and isinstance(value, list):
|
||||
return value
|
||||
else:
|
||||
result = GoogleJobsScraper._find_job_info(value)
|
||||
if result:
|
||||
return result
|
||||
elif isinstance(jobs_data, list):
|
||||
for item in jobs_data:
|
||||
result = GoogleJobsScraper._find_job_info(item)
|
||||
if result:
|
||||
return result
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _find_job_info_initial_page(html_text: str):
|
||||
pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
|
||||
results = []
|
||||
matches = re.finditer(pattern, html_text)
|
||||
|
||||
import json
|
||||
|
||||
for match in matches:
|
||||
try:
|
||||
parsed_data = json.loads(match.group(1))
|
||||
results.append(parsed_data)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
log.error(f"Failed to parse match: {str(e)}")
|
||||
results.append({"raw_match": match.group(0), "error": str(e)})
|
||||
return results
|
|
@@ -0,0 +1,41 @@
import re

from jobspy.util import create_logger

log = create_logger("Google")


def find_job_info(jobs_data: list | dict) -> list | None:
    """Iterates through the JSON data to find the job listings"""
    if isinstance(jobs_data, dict):
        for key, value in jobs_data.items():
            if key == "520084652" and isinstance(value, list):
                return value
            else:
                result = find_job_info(value)
                if result:
                    return result
    elif isinstance(jobs_data, list):
        for item in jobs_data:
            result = find_job_info(item)
            if result:
                return result
    return None


def find_job_info_initial_page(html_text: str):
    pattern = f'520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
    results = []
    matches = re.finditer(pattern, html_text)

    import json

    for match in matches:
        try:
            parsed_data = json.loads(match.group(1))
            results.append(parsed_data)

        except json.JSONDecodeError as e:
            log.error(f"Failed to parse match: {str(e)}")
            results.append({"raw_match": match.group(0), "error": str(e)})
    return results
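find_job_info above walks arbitrarily nested Google Jobs JSON until it finds the list stored under the key "520084652". A small illustration with made-up data:

```python
from jobspy.google.util import find_job_info

# Made-up nesting; only the "520084652" key matters to the search.
payload = {
    "wrapper": [
        {"noise": 1},
        {"inner": {"520084652": ["job-entry-1", "job-entry-2"]}},
    ]
}
assert find_job_info(payload) == ["job-entry-1", "job-entry-2"]
assert find_job_info({"other": []}) is None
```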
@ -1,39 +1,32 @@
|
|||
"""
|
||||
jobspy.scrapers.indeed
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains routines to scrape Indeed.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import Tuple
|
||||
from datetime import datetime
|
||||
from typing import Tuple
|
||||
|
||||
from .constants import job_search_query, api_headers
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import (
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
create_logger,
|
||||
)
|
||||
from ...jobs import (
|
||||
from jobspy.indeed.constant import job_search_query, api_headers
|
||||
from jobspy.indeed.util import is_job_remote, get_compensation, get_job_type
|
||||
from jobspy.model import (
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
JobPost,
|
||||
Compensation,
|
||||
CompensationInterval,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
DescriptionFormat,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
create_logger,
|
||||
)
|
||||
|
||||
log = create_logger("Indeed")
|
||||
|
||||
|
||||
class IndeedScraper(Scraper):
|
||||
class Indeed(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
):
|
||||
|
@ -213,7 +206,7 @@ class IndeedScraper(Scraper):
|
|||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
description = markdown_converter(description)
|
||||
|
||||
job_type = self._get_job_type(job["attributes"])
|
||||
job_type = get_job_type(job["attributes"])
|
||||
timestamp_seconds = job["datePublished"] / 1000
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
|
||||
employer = job["employer"].get("dossier") if job["employer"] else None
|
||||
|
@ -234,14 +227,14 @@ class IndeedScraper(Scraper):
|
|||
country=job.get("location", {}).get("countryCode"),
|
||||
),
|
||||
job_type=job_type,
|
||||
compensation=self._get_compensation(job["compensation"]),
|
||||
compensation=get_compensation(job["compensation"]),
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
job_url_direct=(
|
||||
job["recruit"].get("viewJobUrl") if job.get("recruit") else None
|
||||
),
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
is_remote=self._is_job_remote(job, description),
|
||||
is_remote=is_job_remote(job, description),
|
||||
company_addresses=(
|
||||
employer_details["addresses"][0]
|
||||
if employer_details.get("addresses")
|
||||
|
@ -265,86 +258,3 @@ class IndeedScraper(Scraper):
|
|||
else None
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_job_type(attributes: list) -> list[JobType]:
|
||||
"""
|
||||
Parses the attributes to get list of job types
|
||||
:param attributes:
|
||||
:return: list of JobType
|
||||
"""
|
||||
job_types: list[JobType] = []
|
||||
for attribute in attributes:
|
||||
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
|
||||
job_type = get_enum_from_job_type(job_type_str)
|
||||
if job_type:
|
||||
job_types.append(job_type)
|
||||
return job_types
|
||||
|
||||
@staticmethod
|
||||
def _get_compensation(compensation: dict) -> Compensation | None:
|
||||
"""
|
||||
Parses the job to get compensation
|
||||
:param job:
|
||||
:return: compensation object
|
||||
"""
|
||||
if not compensation["baseSalary"] and not compensation["estimated"]:
|
||||
return None
|
||||
comp = (
|
||||
compensation["baseSalary"]
|
||||
if compensation["baseSalary"]
|
||||
else compensation["estimated"]["baseSalary"]
|
||||
)
|
||||
if not comp:
|
||||
return None
|
||||
interval = IndeedScraper._get_compensation_interval(comp["unitOfWork"])
|
||||
if not interval:
|
||||
return None
|
||||
min_range = comp["range"].get("min")
|
||||
max_range = comp["range"].get("max")
|
||||
return Compensation(
|
||||
interval=interval,
|
||||
min_amount=int(min_range) if min_range is not None else None,
|
||||
max_amount=int(max_range) if max_range is not None else None,
|
||||
currency=(
|
||||
compensation["estimated"]["currencyCode"]
|
||||
if compensation["estimated"]
|
||||
else compensation["currencyCode"]
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _is_job_remote(job: dict, description: str) -> bool:
|
||||
"""
|
||||
Searches the description, location, and attributes to check if job is remote
|
||||
"""
|
||||
remote_keywords = ["remote", "work from home", "wfh"]
|
||||
is_remote_in_attributes = any(
|
||||
any(keyword in attr["label"].lower() for keyword in remote_keywords)
|
||||
for attr in job["attributes"]
|
||||
)
|
||||
is_remote_in_description = any(
|
||||
keyword in description.lower() for keyword in remote_keywords
|
||||
)
|
||||
is_remote_in_location = any(
|
||||
keyword in job["location"]["formatted"]["long"].lower()
|
||||
for keyword in remote_keywords
|
||||
)
|
||||
return (
|
||||
is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_compensation_interval(interval: str) -> CompensationInterval:
|
||||
interval_mapping = {
|
||||
"DAY": "DAILY",
|
||||
"YEAR": "YEARLY",
|
||||
"HOUR": "HOURLY",
|
||||
"WEEK": "WEEKLY",
|
||||
"MONTH": "MONTHLY",
|
||||
}
|
||||
mapped_interval = interval_mapping.get(interval.upper(), None)
|
||||
if mapped_interval and mapped_interval in CompensationInterval.__members__:
|
||||
return CompensationInterval[mapped_interval]
|
||||
else:
|
||||
raise ValueError(f"Unsupported interval: {interval}")
|
|
@ -0,0 +1,83 @@
|
|||
from jobspy.model import CompensationInterval, JobType, Compensation
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
def get_job_type(attributes: list) -> list[JobType]:
|
||||
"""
|
||||
Parses the attributes to get list of job types
|
||||
:param attributes:
|
||||
:return: list of JobType
|
||||
"""
|
||||
job_types: list[JobType] = []
|
||||
for attribute in attributes:
|
||||
job_type_str = attribute["label"].replace("-", "").replace(" ", "").lower()
|
||||
job_type = get_enum_from_job_type(job_type_str)
|
||||
if job_type:
|
||||
job_types.append(job_type)
|
||||
return job_types
|
||||
|
||||
|
||||
def get_compensation(compensation: dict) -> Compensation | None:
|
||||
"""
|
||||
Parses the job to get compensation
|
||||
:param compensation:
|
||||
:return: compensation object
|
||||
"""
|
||||
if not compensation["baseSalary"] and not compensation["estimated"]:
|
||||
return None
|
||||
comp = (
|
||||
compensation["baseSalary"]
|
||||
if compensation["baseSalary"]
|
||||
else compensation["estimated"]["baseSalary"]
|
||||
)
|
||||
if not comp:
|
||||
return None
|
||||
interval = get_compensation_interval(comp["unitOfWork"])
|
||||
if not interval:
|
||||
return None
|
||||
min_range = comp["range"].get("min")
|
||||
max_range = comp["range"].get("max")
|
||||
return Compensation(
|
||||
interval=interval,
|
||||
min_amount=int(min_range) if min_range is not None else None,
|
||||
max_amount=int(max_range) if max_range is not None else None,
|
||||
currency=(
|
||||
compensation["estimated"]["currencyCode"]
|
||||
if compensation["estimated"]
|
||||
else compensation["currencyCode"]
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def is_job_remote(job: dict, description: str) -> bool:
|
||||
"""
|
||||
Searches the description, location, and attributes to check if job is remote
|
||||
"""
|
||||
remote_keywords = ["remote", "work from home", "wfh"]
|
||||
is_remote_in_attributes = any(
|
||||
any(keyword in attr["label"].lower() for keyword in remote_keywords)
|
||||
for attr in job["attributes"]
|
||||
)
|
||||
is_remote_in_description = any(
|
||||
keyword in description.lower() for keyword in remote_keywords
|
||||
)
|
||||
is_remote_in_location = any(
|
||||
keyword in job["location"]["formatted"]["long"].lower()
|
||||
for keyword in remote_keywords
|
||||
)
|
||||
return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
|
||||
|
||||
|
||||
def get_compensation_interval(interval: str) -> CompensationInterval:
|
||||
interval_mapping = {
|
||||
"DAY": "DAILY",
|
||||
"YEAR": "YEARLY",
|
||||
"HOUR": "HOURLY",
|
||||
"WEEK": "WEEKLY",
|
||||
"MONTH": "MONTHLY",
|
||||
}
|
||||
mapped_interval = interval_mapping.get(interval.upper(), None)
|
||||
if mapped_interval and mapped_interval in CompensationInterval.__members__:
|
||||
return CompensationInterval[mapped_interval]
|
||||
else:
|
||||
raise ValueError(f"Unsupported interval: {interval}")
|
|
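A usage sketch for the Indeed helpers above; the compensation payload is illustrative and contains only the keys the code reads (baseSalary.unitOfWork, baseSalary.range, currencyCode), and it assumes CompensationInterval defines the YEARLY and HOURLY members that the interval mapping implies:

```python
from jobspy.indeed.util import get_compensation, get_compensation_interval
from jobspy.model import CompensationInterval

assert get_compensation_interval("HOUR") == CompensationInterval.HOURLY

# Illustrative payload shaped like the fields get_compensation reads.
compensation = {
    "baseSalary": {"unitOfWork": "YEAR", "range": {"min": 130000, "max": 150000}},
    "estimated": None,
    "currencyCode": "USD",
}
comp = get_compensation(compensation)
assert comp.interval == CompensationInterval.YEARLY
assert comp.min_amount == 130000 and comp.max_amount == 150000
assert comp.currency == "USD"
```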
@ -1,47 +1,49 @@
|
|||
"""
|
||||
jobspy.scrapers.linkedin
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This module contains routines to scrape LinkedIn.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import time
|
||||
import random
|
||||
import regex as re
|
||||
from typing import Optional
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from bs4.element import Tag
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse, urlunparse, unquote
|
||||
|
||||
from .constants import headers
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import LinkedInException
|
||||
from ..utils import create_session, remove_attributes, create_logger
|
||||
from ...jobs import (
|
||||
import regex as re
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
|
||||
from jobspy.exception import LinkedInException
|
||||
from jobspy.linkedin.constant import headers
|
||||
from jobspy.linkedin.util import (
|
||||
is_job_remote,
|
||||
job_type_code,
|
||||
parse_job_type,
|
||||
parse_job_level,
|
||||
parse_company_industry
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Country,
|
||||
Compensation,
|
||||
DescriptionFormat,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
)
|
||||
from ..utils import (
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
currency_parser,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
remove_attributes,
|
||||
create_logger,
|
||||
)
|
||||
|
||||
log = create_logger("LinkedIn")
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
class LinkedIn(Scraper):
|
||||
base_url = "https://www.linkedin.com"
|
||||
delay = 3
|
||||
band_delay = 4
|
||||
|
@ -95,7 +97,7 @@ class LinkedInScraper(Scraper):
|
|||
"distance": scraper_input.distance,
|
||||
"f_WT": 2 if scraper_input.is_remote else None,
|
||||
"f_JT": (
|
||||
self.job_type_code(scraper_input.job_type)
|
||||
job_type_code(scraper_input.job_type)
|
||||
if scraper_input.job_type
|
||||
else None
|
||||
),
|
||||
|
@ -172,7 +174,7 @@ class LinkedInScraper(Scraper):
|
|||
) -> Optional[JobPost]:
|
||||
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
||||
|
||||
compensation = None
|
||||
compensation = description = None
|
||||
if salary_tag:
|
||||
salary_text = salary_tag.get_text(separator=" ").strip()
|
||||
salary_values = [currency_parser(value) for value in salary_text.split("-")]
|
||||
|
@ -216,6 +218,8 @@ class LinkedInScraper(Scraper):
|
|||
job_details = {}
|
||||
if full_descr:
|
||||
job_details = self._get_job_details(job_id)
|
||||
description = job_details.get("description")
|
||||
is_remote = is_job_remote(title, description, location)
|
||||
|
||||
return JobPost(
|
||||
id=f"li-{job_id}",
|
||||
|
@ -223,6 +227,7 @@ class LinkedInScraper(Scraper):
|
|||
company_name=company,
|
||||
company_url=company_url,
|
||||
location=location,
|
||||
is_remote=is_remote,
|
||||
date_posted=date_posted,
|
||||
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||
compensation=compensation,
|
||||
|
@ -231,7 +236,7 @@ class LinkedInScraper(Scraper):
|
|||
company_industry=job_details.get("company_industry"),
|
||||
description=job_details.get("description"),
|
||||
job_url_direct=job_details.get("job_url_direct"),
|
||||
emails=extract_emails_from_text(job_details.get("description")),
|
||||
emails=extract_emails_from_text(description),
|
||||
company_logo=job_details.get("company_logo"),
|
||||
job_function=job_details.get("job_function"),
|
||||
)
|
||||
|
@ -282,9 +287,9 @@ class LinkedInScraper(Scraper):
|
|||
)
|
||||
return {
|
||||
"description": description,
|
||||
"job_level": self._parse_job_level(soup),
|
||||
"company_industry": self._parse_company_industry(soup),
|
||||
"job_type": self._parse_job_type(soup),
|
||||
"job_level": parse_job_level(soup),
|
||||
"company_industry": parse_company_industry(soup),
|
||||
"job_type": parse_job_type(soup),
|
||||
"job_url_direct": self._parse_job_url_direct(soup),
|
||||
"company_logo": company_logo,
|
||||
"job_function": job_function,
|
||||
|
@ -316,77 +321,6 @@ class LinkedInScraper(Scraper):
|
|||
location = Location(city=city, state=state, country=country)
|
||||
return location
|
||||
|
||||
@staticmethod
|
||||
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
|
||||
"""
|
||||
Gets the job type from job page
|
||||
:param soup_job_type:
|
||||
:return: JobType
|
||||
"""
|
||||
h3_tag = soup_job_type.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Employment type" in text,
|
||||
)
|
||||
employment_type = None
|
||||
if h3_tag:
|
||||
employment_type_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if employment_type_span:
|
||||
employment_type = employment_type_span.get_text(strip=True)
|
||||
employment_type = employment_type.lower()
|
||||
employment_type = employment_type.replace("-", "")
|
||||
|
||||
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
||||
|
||||
@staticmethod
|
||||
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
|
||||
"""
|
||||
Gets the job level from job page
|
||||
:param soup_job_level:
|
||||
:return: str
|
||||
"""
|
||||
h3_tag = soup_job_level.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Seniority level" in text,
|
||||
)
|
||||
job_level = None
|
||||
if h3_tag:
|
||||
job_level_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if job_level_span:
|
||||
job_level = job_level_span.get_text(strip=True)
|
||||
|
||||
return job_level
|
||||
|
||||
@staticmethod
|
||||
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
|
||||
"""
|
||||
Gets the company industry from job page
|
||||
:param soup_industry:
|
||||
:return: str
|
||||
"""
|
||||
h3_tag = soup_industry.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Industries" in text,
|
||||
)
|
||||
industry = None
|
||||
if h3_tag:
|
||||
industry_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if industry_span:
|
||||
industry = industry_span.get_text(strip=True)
|
||||
|
||||
return industry
|
||||
|
||||
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
|
||||
"""
|
||||
Gets the job url direct from job page
|
||||
|
@ -403,13 +337,3 @@ class LinkedInScraper(Scraper):
|
|||
job_url_direct = unquote(job_url_direct_match.group())
|
||||
|
||||
return job_url_direct
|
||||
|
||||
@staticmethod
|
||||
def job_type_code(job_type_enum: JobType) -> str:
|
||||
return {
|
||||
JobType.FULL_TIME: "F",
|
||||
JobType.PART_TIME: "P",
|
||||
JobType.INTERNSHIP: "I",
|
||||
JobType.CONTRACT: "C",
|
||||
JobType.TEMPORARY: "T",
|
||||
}.get(job_type_enum, "")
|
|
@ -0,0 +1,96 @@
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
from jobspy.model import JobType, Location
|
||||
from jobspy.util import get_enum_from_job_type
|
||||
|
||||
|
||||
def job_type_code(job_type_enum: JobType) -> str:
|
||||
return {
|
||||
JobType.FULL_TIME: "F",
|
||||
JobType.PART_TIME: "P",
|
||||
JobType.INTERNSHIP: "I",
|
||||
JobType.CONTRACT: "C",
|
||||
JobType.TEMPORARY: "T",
|
||||
}.get(job_type_enum, "")
|
||||
|
||||
|
||||
def parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
|
||||
"""
|
||||
Gets the job type from job page
|
||||
:param soup_job_type:
|
||||
:return: JobType
|
||||
"""
|
||||
h3_tag = soup_job_type.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Employment type" in text,
|
||||
)
|
||||
employment_type = None
|
||||
if h3_tag:
|
||||
employment_type_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if employment_type_span:
|
||||
employment_type = employment_type_span.get_text(strip=True)
|
||||
employment_type = employment_type.lower()
|
||||
employment_type = employment_type.replace("-", "")
|
||||
|
||||
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
||||
|
||||
|
||||
def parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
|
||||
"""
|
||||
Gets the job level from job page
|
||||
:param soup_job_level:
|
||||
:return: str
|
||||
"""
|
||||
h3_tag = soup_job_level.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Seniority level" in text,
|
||||
)
|
||||
job_level = None
|
||||
if h3_tag:
|
||||
job_level_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if job_level_span:
|
||||
job_level = job_level_span.get_text(strip=True)
|
||||
|
||||
return job_level
|
||||
|
||||
|
||||
def parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
|
||||
"""
|
||||
Gets the company industry from job page
|
||||
:param soup_industry:
|
||||
:return: str
|
||||
"""
|
||||
h3_tag = soup_industry.find(
|
||||
"h3",
|
||||
class_="description__job-criteria-subheader",
|
||||
string=lambda text: "Industries" in text,
|
||||
)
|
||||
industry = None
|
||||
if h3_tag:
|
||||
industry_span = h3_tag.find_next_sibling(
|
||||
"span",
|
||||
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||
)
|
||||
if industry_span:
|
||||
industry = industry_span.get_text(strip=True)
|
||||
|
||||
return industry
|
||||
|
||||
|
||||
def is_job_remote(title: dict, description: str, location: Location) -> bool:
|
||||
"""
|
||||
Searches the title, location, and description to check if job is remote
|
||||
"""
|
||||
remote_keywords = ["remote", "work from home", "wfh"]
|
||||
location = location.display_location()
|
||||
full_string = f'{title} {description} {location}'.lower()
|
||||
is_remote = any(keyword in full_string for keyword in remote_keywords)
|
||||
return is_remote
|
|
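A quick usage sketch for the LinkedIn helpers above: job_type_code maps JobType members to LinkedIn's single-letter f_JT filter values, and is_job_remote keyword-matches the combined title, description, and location string:

```python
from jobspy.linkedin.util import job_type_code, is_job_remote
from jobspy.model import JobType, Location

assert job_type_code(JobType.FULL_TIME) == "F"
assert job_type_code(JobType.CONTRACT) == "C"

# Illustrative inputs; only the "remote"/"work from home"/"wfh" keywords matter.
location = Location(city="Austin", state="TX")
assert is_job_remote("Senior Engineer", "This role is fully remote.", location) is True
assert is_job_remote("Senior Engineer", "On-site position.", location) is False
```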
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
|
@ -68,16 +69,20 @@ class Country(Enum):
|
|||
AUSTRIA = ("austria", "at", "at")
|
||||
BAHRAIN = ("bahrain", "bh")
|
||||
BELGIUM = ("belgium", "be", "fr:be")
|
||||
BULGARIA = ("bulgaria", "bg")
|
||||
BRAZIL = ("brazil", "br", "com.br")
|
||||
CANADA = ("canada", "ca", "ca")
|
||||
CHILE = ("chile", "cl")
|
||||
CHINA = ("china", "cn")
|
||||
COLOMBIA = ("colombia", "co")
|
||||
COSTARICA = ("costa rica", "cr")
|
||||
CROATIA = ("croatia", "hr")
|
||||
CYPRUS = ("cyprus", "cy")
|
||||
CZECHREPUBLIC = ("czech republic,czechia", "cz")
|
||||
DENMARK = ("denmark", "dk")
|
||||
ECUADOR = ("ecuador", "ec")
|
||||
EGYPT = ("egypt", "eg")
|
||||
ESTONIA = ("estonia", "ee")
|
||||
FINLAND = ("finland", "fi")
|
||||
FRANCE = ("france", "fr", "fr")
|
||||
GERMANY = ("germany", "de", "de")
|
||||
|
@ -91,6 +96,8 @@ class Country(Enum):
|
|||
ITALY = ("italy", "it", "it")
|
||||
JAPAN = ("japan", "jp")
|
||||
KUWAIT = ("kuwait", "kw")
|
||||
LATVIA = ("latvia", "lv")
|
||||
LITHUANIA = ("lithuania", "lt")
|
||||
LUXEMBOURG = ("luxembourg", "lu")
|
||||
MALAYSIA = ("malaysia", "malaysia:my", "com")
|
||||
MALTA = ("malta", "malta:mt", "mt")
|
||||
|
@ -111,6 +118,8 @@ class Country(Enum):
|
|||
ROMANIA = ("romania", "ro")
|
||||
SAUDIARABIA = ("saudi arabia", "sa")
|
||||
SINGAPORE = ("singapore", "sg", "sg")
|
||||
SLOVAKIA = ("slovakia", "sk")
|
||||
SLOVENIA = ("slovenia", "sl")
|
||||
SOUTHAFRICA = ("south africa", "za")
|
||||
SOUTHKOREA = ("south korea", "kr")
|
||||
SPAIN = ("spain", "es", "es")
|
||||
|
@ -245,13 +254,13 @@ class JobPost(BaseModel):
|
|||
is_remote: bool | None = None
|
||||
listing_type: str | None = None
|
||||
|
||||
# linkedin specific
|
||||
# LinkedIn specific
|
||||
job_level: str | None = None
|
||||
|
||||
# linkedin and indeed specific
|
||||
# LinkedIn and Indeed specific
|
||||
company_industry: str | None = None
|
||||
|
||||
# indeed specific
|
||||
# Indeed specific
|
||||
company_addresses: str | None = None
|
||||
company_num_employees: str | None = None
|
||||
company_revenue: str | None = None
|
||||
|
@ -259,9 +268,63 @@ class JobPost(BaseModel):
|
|||
company_logo: str | None = None
|
||||
banner_photo_url: str | None = None
|
||||
|
||||
# linkedin only atm
|
||||
# LinkedIn only atm
|
||||
job_function: str | None = None
|
||||
|
||||
# Naukri specific
|
||||
skills: list[str] | None = None #from tagsAndSkills
|
||||
experience_range: str | None = None #from experienceText
|
||||
company_rating: float | None = None #from ambitionBoxData.AggregateRating
|
||||
company_reviews_count: int | None = None #from ambitionBoxData.ReviewsCount
|
||||
vacancy_count: int | None = None #from vacancy
|
||||
work_from_home_type: str | None = None #from clusters.wfhType (e.g., "Hybrid", "Remote")
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
jobs: list[JobPost] = []
|
||||
|
||||
|
||||
class Site(Enum):
|
||||
LINKEDIN = "linkedin"
|
||||
INDEED = "indeed"
|
||||
ZIP_RECRUITER = "zip_recruiter"
|
||||
GLASSDOOR = "glassdoor"
|
||||
GOOGLE = "google"
|
||||
BAYT = "bayt"
|
||||
NAUKRI = "naukri"
|
||||
|
||||
|
||||
class SalarySource(Enum):
|
||||
DIRECT_DATA = "direct_data"
|
||||
DESCRIPTION = "description"
|
||||
|
||||
|
||||
class ScraperInput(BaseModel):
|
||||
site_type: list[Site]
|
||||
search_term: str | None = None
|
||||
google_search_term: str | None = None
|
||||
|
||||
location: str | None = None
|
||||
country: Country | None = Country.USA
|
||||
distance: int | None = None
|
||||
is_remote: bool = False
|
||||
job_type: JobType | None = None
|
||||
easy_apply: bool | None = None
|
||||
offset: int = 0
|
||||
linkedin_fetch_description: bool = False
|
||||
linkedin_company_ids: list[int] | None = None
|
||||
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
|
||||
|
||||
results_wanted: int = 15
|
||||
hours_old: int | None = None
|
||||
|
||||
|
||||
class Scraper(ABC):
|
||||
def __init__(
|
||||
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
|
||||
):
|
||||
self.site = site
|
||||
self.proxies = proxies
|
||||
self.ca_cert = ca_cert
|
||||
|
||||
@abstractmethod
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
|
@ -0,0 +1,301 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
from datetime import datetime, date, timedelta
|
||||
from typing import Optional
|
||||
|
||||
import regex as re
|
||||
import requests
|
||||
|
||||
from jobspy.exception import NaukriException
|
||||
from jobspy.naukri.constant import headers as naukri_headers
|
||||
from jobspy.naukri.util import (
|
||||
is_job_remote,
|
||||
parse_job_type,
|
||||
parse_company_industry,
|
||||
)
|
||||
from jobspy.model import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
Country,
|
||||
Compensation,
|
||||
DescriptionFormat,
|
||||
Scraper,
|
||||
ScraperInput,
|
||||
Site,
|
||||
)
|
||||
from jobspy.util import (
|
||||
extract_emails_from_text,
|
||||
currency_parser,
|
||||
markdown_converter,
|
||||
create_session,
|
||||
create_logger,
|
||||
)
|
||||
|
||||
log = create_logger("Naukri")
|
||||
|
||||
class Naukri(Scraper):
|
||||
base_url = "https://www.naukri.com/jobapi/v3/search"
|
||||
delay = 3
|
||||
band_delay = 4
|
||||
jobs_per_page = 20
|
||||
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
):
|
||||
"""
|
||||
Initializes NaukriScraper with the Naukri API URL
|
||||
"""
|
||||
super().__init__(Site.NAUKRI, proxies=proxies, ca_cert=ca_cert)
|
||||
self.session = create_session(
|
||||
proxies=self.proxies,
|
||||
ca_cert=ca_cert,
|
||||
is_tls=False,
|
||||
has_retry=True,
|
||||
delay=5,
|
||||
clear_cookies=True,
|
||||
)
|
||||
self.session.headers.update(naukri_headers)
|
||||
self.scraper_input = None
|
||||
self.country = "India" #naukri is india-focused by default
|
||||
log.info("Naukri scraper initialized")
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
Scrapes Naukri API for jobs with scraper_input criteria
|
||||
:param scraper_input:
|
||||
:return: job_response
|
||||
"""
|
||||
self.scraper_input = scraper_input
|
||||
job_list: list[JobPost] = []
|
||||
seen_ids = set()
|
||||
start = scraper_input.offset or 0
|
||||
page = (start // self.jobs_per_page) + 1
|
||||
request_count = 0
|
||||
seconds_old = (
|
||||
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
||||
)
|
||||
continue_search = (
|
||||
lambda: len(job_list) < scraper_input.results_wanted and page <= 50 # Arbitrary limit
|
||||
)
|
||||
|
||||
while continue_search():
|
||||
request_count += 1
|
||||
log.info(
|
||||
f"Scraping page {request_count} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)} "
|
||||
f"for search term: {scraper_input.search_term}"
|
||||
)
|
||||
params = {
|
||||
"noOfResults": self.jobs_per_page,
|
||||
"urlType": "search_by_keyword",
|
||||
"searchType": "adv",
|
||||
"keyword": scraper_input.search_term,
|
||||
"pageNo": page,
|
||||
"k": scraper_input.search_term,
|
||||
"seoKey": f"{scraper_input.search_term.lower().replace(' ', '-')}-jobs",
|
||||
"src": "jobsearchDesk",
|
||||
"latLong": "",
|
||||
"location": scraper_input.location,
|
||||
"remote": "true" if scraper_input.is_remote else None,
|
||||
}
|
||||
if seconds_old:
|
||||
params["days"] = seconds_old // 86400 # Convert to days
|
||||
|
||||
params = {k: v for k, v in params.items() if v is not None}
|
||||
try:
|
||||
log.debug(f"Sending request to {self.base_url} with params: {params}")
|
||||
response = self.session.get(self.base_url, params=params, timeout=10)
|
||||
if response.status_code not in range(200, 400):
|
||||
err = f"Naukri API response status code {response.status_code} - {response.text}"
|
||||
log.error(err)
|
||||
return JobResponse(jobs=job_list)
|
||||
data = response.json()
|
||||
job_details = data.get("jobDetails", [])
|
||||
log.info(f"Received {len(job_details)} job entries from API")
|
||||
if not job_details:
|
||||
log.warning("No job details found in API response")
|
||||
break
|
||||
except Exception as e:
|
||||
log.error(f"Naukri API request failed: {str(e)}")
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
for job in job_details:
|
||||
job_id = job.get("jobId")
|
||||
if not job_id or job_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(job_id)
|
||||
log.debug(f"Processing job ID: {job_id}")
|
||||
|
||||
try:
|
||||
fetch_desc = scraper_input.linkedin_fetch_description
|
||||
job_post = self._process_job(job, job_id, fetch_desc)
|
||||
if job_post:
|
||||
job_list.append(job_post)
|
||||
log.info(f"Added job: {job_post.title} (ID: {job_id})")
|
||||
if not continue_search():
|
||||
break
|
||||
except Exception as e:
|
||||
log.error(f"Error processing job ID {job_id}: {str(e)}")
|
||||
raise NaukriException(str(e))
|
||||
|
||||
if continue_search():
|
||||
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
||||
page += 1
|
||||
|
||||
job_list = job_list[:scraper_input.results_wanted]
|
||||
log.info(f"Scraping completed. Total jobs collected: {len(job_list)}")
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
def _process_job(
|
||||
self, job: dict, job_id: str, full_descr: bool
|
||||
) -> Optional[JobPost]:
|
||||
"""
|
||||
Processes a single job from API response into a JobPost object
|
||||
"""
|
||||
title = job.get("title", "N/A")
|
||||
company = job.get("companyName", "N/A")
|
||||
company_url = f"https://www.naukri.com/{job.get('staticUrl', '')}" if job.get("staticUrl") else None
|
||||
|
||||
location = self._get_location(job.get("placeholders", []))
|
||||
compensation = self._get_compensation(job.get("placeholders", []))
|
||||
date_posted = self._parse_date(job.get("footerPlaceholderLabel"), job.get("createdDate"))
|
||||
|
||||
job_url = f"https://www.naukri.com{job.get('jdURL', f'/job/{job_id}')}"
|
||||
description = job.get("jobDescription") if full_descr else None
|
||||
if description and self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
        description = markdown_converter(description)
        job_type = parse_job_type(description) if description else None
        company_industry = parse_company_industry(description) if description else None
        is_remote = is_job_remote(title, description or "", location)
        company_logo = job.get("logoPathV3") or job.get("logoPath")

        # Naukri-specific fields
        skills = job.get("tagsAndSkills", "").split(",") if job.get("tagsAndSkills") else None
        experience_range = job.get("experienceText")
        ambition_box = job.get("ambitionBoxData", {})
        company_rating = float(ambition_box.get("AggregateRating")) if ambition_box.get("AggregateRating") else None
        company_reviews_count = ambition_box.get("ReviewsCount")
        vacancy_count = job.get("vacancy")
        work_from_home_type = self._infer_work_from_home_type(job.get("placeholders", []), title, description or "")

        job_post = JobPost(
            id=f"nk-{job_id}",
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            is_remote=is_remote,
            date_posted=date_posted,
            job_url=job_url,
            compensation=compensation,
            job_type=job_type,
            company_industry=company_industry,
            description=description,
            emails=extract_emails_from_text(description or ""),
            company_logo=company_logo,
            skills=skills,
            experience_range=experience_range,
            company_rating=company_rating,
            company_reviews_count=company_reviews_count,
            vacancy_count=vacancy_count,
            work_from_home_type=work_from_home_type,
        )
        log.debug(f"Processed job: {title} at {company}")
        return job_post

    def _get_location(self, placeholders: list[dict]) -> Location:
        """
        Extracts location data from placeholders
        """
        location = Location(country=Country.INDIA)
        for placeholder in placeholders:
            if placeholder.get("type") == "location":
                location_str = placeholder.get("label", "")
                parts = location_str.split(", ")
                city = parts[0] if parts else None
                state = parts[1] if len(parts) > 1 else None
                location = Location(city=city, state=state, country=Country.INDIA)
                log.debug(f"Parsed location: {location.display_location()}")
                break
        return location

    def _get_compensation(self, placeholders: list[dict]) -> Optional[Compensation]:
        """
        Extracts compensation data from placeholders, handling Indian salary formats (Lakhs, Crores)
        """
        for placeholder in placeholders:
            if placeholder.get("type") == "salary":
                salary_text = placeholder.get("label", "").strip()
                if salary_text == "Not disclosed":
                    log.debug("Salary not disclosed")
                    return None

                # Handle Indian salary formats (e.g., "12-16 Lacs P.A.", "1-5 Cr")
                salary_match = re.match(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?", salary_text, re.IGNORECASE)
                if salary_match:
                    min_salary, max_salary, unit = salary_match.groups()[:3]
                    min_salary, max_salary = float(min_salary), float(max_salary)
                    currency = "INR"

                    # Convert to base units (INR)
                    if unit.lower() in ("lacs", "lakh"):
                        min_salary *= 100000  # 1 Lakh = 100,000 INR
                        max_salary *= 100000
                    elif unit.lower() == "cr":
                        min_salary *= 10000000  # 1 Crore = 10,000,000 INR
                        max_salary *= 10000000

                    log.debug(f"Parsed salary: {min_salary} - {max_salary} INR")
                    return Compensation(
                        min_amount=int(min_salary),
                        max_amount=int(max_salary),
                        currency=currency,
                    )
                else:
                    log.debug(f"Could not parse salary: {salary_text}")
                    return None
        return None

    def _parse_date(self, label: str, created_date: int) -> Optional[date]:
        """
        Parses date from footerPlaceholderLabel or createdDate, returning a date object
        """
        today = datetime.now()
        if not label:
            if created_date:
                return datetime.fromtimestamp(created_date / 1000).date()  # Convert to date
            return None
        label = label.lower()
        if "today" in label or "just now" in label or "few hours" in label:
            log.debug("Date parsed as today")
            return today.date()
        elif "ago" in label:
            match = re.search(r"(\d+)\s*day", label)
            if match:
                days = int(match.group(1))
                parsed_date = (today - timedelta(days=days)).date()
                log.debug(f"Date parsed: {days} days ago -> {parsed_date}")
                return parsed_date
        elif created_date:
            parsed_date = datetime.fromtimestamp(created_date / 1000).date()
            log.debug(f"Date parsed from timestamp: {parsed_date}")
            return parsed_date
        log.debug("No date parsed")
        return None

    def _infer_work_from_home_type(self, placeholders: list[dict], title: str, description: str) -> Optional[str]:
        """
        Infers work-from-home type from job data (e.g., 'Hybrid', 'Remote', 'Work from office')
        """
        location_str = next((p["label"] for p in placeholders if p["type"] == "location"), "").lower()
        if "hybrid" in location_str or "hybrid" in title.lower() or "hybrid" in description.lower():
            return "Hybrid"
        elif "remote" in location_str or "remote" in title.lower() or "remote" in description.lower():
            return "Remote"
        elif "work from office" in description.lower() or not ("remote" in description.lower() or "hybrid" in description.lower()):
            return "Work from office"
        return None
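A quick sanity check of the Lakh/Crore arithmetic in `_get_compensation` above, as a standalone sketch: the regex and multipliers are copied from the method, but the `Compensation` model is replaced with a plain dict so the snippet runs on its own.

```python
import re


def parse_inr_salary(salary_text: str) -> dict | None:
    """Standalone sketch of the Lakh/Crore parsing used by the Naukri scraper."""
    match = re.match(
        r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(Lacs|Lakh|Cr)\s*(P\.A\.)?",
        salary_text,
        re.IGNORECASE,
    )
    if not match:
        return None
    min_salary, max_salary, unit = match.groups()[:3]
    min_salary, max_salary = float(min_salary), float(max_salary)
    # 1 Lakh = 100,000 INR; 1 Crore = 10,000,000 INR
    multiplier = 100_000 if unit.lower() in ("lacs", "lakh") else 10_000_000
    return {
        "min_amount": int(min_salary * multiplier),
        "max_amount": int(max_salary * multiplier),
        "currency": "INR",
    }


print(parse_inr_salary("12-16 Lacs P.A."))  # {'min_amount': 1200000, 'max_amount': 1600000, 'currency': 'INR'}
print(parse_inr_salary("1-5 Cr"))           # {'min_amount': 10000000, 'max_amount': 50000000, 'currency': 'INR'}
print(parse_inr_salary("Not disclosed"))    # None
```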
@ -0,0 +1,11 @@
headers = {
    "authority": "www.naukri.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "appid": "109",
    "systemid": "Naukri",
    "Nkparam": "Ppy0YK9uSHqPtG3bEejYc04RTpUN2CjJOrqA68tzQt0SKJHXZKzz9M8cZtKLVkoOuQmfe4cTb1r2CwfHaxW5Tg==",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
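These headers would typically be attached to whatever HTTP session the scraper builds. A minimal sketch with a plain `requests.Session`; the search URL and query parameter below are illustrative placeholders, not necessarily the endpoint the scraper actually hits.

```python
import requests

session = requests.Session()
session.headers.update(headers)  # `headers` is the dict defined above

# Illustrative only: the real endpoint and parameters used by the scraper may differ.
resp = session.get("https://www.naukri.com/jobapi/v3/search", params={"keyword": "python"})
print(resp.status_code)
```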
@ -0,0 +1,34 @@
from __future__ import annotations

from bs4 import BeautifulSoup

from jobspy.model import JobType, Location
from jobspy.util import get_enum_from_job_type


def parse_job_type(soup: BeautifulSoup) -> list[JobType] | None:
    """
    Gets the job type from the job page
    """
    job_type_tag = soup.find("span", class_="job-type")
    if job_type_tag:
        job_type_str = job_type_tag.get_text(strip=True).lower().replace("-", "")
        return [get_enum_from_job_type(job_type_str)] if job_type_str else None
    return None


def parse_company_industry(soup: BeautifulSoup) -> str | None:
    """
    Gets the company industry from the job page
    """
    industry_tag = soup.find("span", class_="industry")
    return industry_tag.get_text(strip=True) if industry_tag else None


def is_job_remote(title: str, description: str, location: Location) -> bool:
    """
    Searches the title, description, and location to check if the job is remote
    """
    remote_keywords = ["remote", "work from home", "wfh"]
    location_str = location.display_location()
    full_string = f"{title} {description} {location_str}".lower()
    return any(keyword in full_string for keyword in remote_keywords)
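A small usage sketch of `is_job_remote` from the module above, assuming the package is installed and `Location`/`Country` behave as in `jobspy.model`:

```python
from jobspy.model import Country, Location

location = Location(city="Bengaluru", state="Karnataka", country=Country.INDIA)

# Keyword match against title + description + location, case-insensitive.
print(is_job_remote("Senior Python Developer (Remote)", "Work from home allowed", location))  # True
print(is_job_remote("Python Developer", "On-site role in Bengaluru", location))               # False
```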
@ -11,7 +11,7 @@ import urllib3
from markdownify import markdownify as md
from requests.adapters import HTTPAdapter, Retry

from ..jobs import CompensationInterval, JobType
from jobspy.model import CompensationInterval, JobType, Site

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -47,11 +47,12 @@ class RotatingProxySession:
        """Utility method to format a proxy string into a dictionary."""
        if proxy.startswith("http://") or proxy.startswith("https://"):
            return {"http": proxy, "https": proxy}
        if proxy.startswith("socks5://"):
            return {"http": proxy, "https": proxy}
        return {"http": f"http://{proxy}", "https": f"http://{proxy}"}


class RequestsRotating(RotatingProxySession, requests.Session):

    def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
        RotatingProxySession.__init__(self, proxies=proxies)
        requests.Session.__init__(self)
@ -86,7 +87,6 @@ class RequestsRotating(RotatingProxySession, requests.Session):
class TLSRotating(RotatingProxySession, tls_client.Session):

    def __init__(self, proxies=None):
        RotatingProxySession.__init__(self, proxies=proxies)
        tls_client.Session.__init__(self, random_tls_extension_order=True)
@ -286,3 +286,69 @@ def extract_job_type(description: str):
            listing_types.append(key)

    return listing_types if listing_types else None


def map_str_to_site(site_name: str) -> Site:
    return Site[site_name.upper()]


def get_enum_from_value(value_str):
    for job_type in JobType:
        if value_str in job_type.value:
            return job_type
    raise Exception(f"Invalid job type: {value_str}")


def convert_to_annual(job_data: dict):
    if job_data["interval"] == "hourly":
        job_data["min_amount"] *= 2080
        job_data["max_amount"] *= 2080
    if job_data["interval"] == "monthly":
        job_data["min_amount"] *= 12
        job_data["max_amount"] *= 12
    if job_data["interval"] == "weekly":
        job_data["min_amount"] *= 52
        job_data["max_amount"] *= 52
    if job_data["interval"] == "daily":
        job_data["min_amount"] *= 260
        job_data["max_amount"] *= 260
    job_data["interval"] = "yearly"


desired_order = [
    "id",
    "site",
    "job_url",
    "job_url_direct",
    "title",
    "company",
    "location",
    "date_posted",
    "job_type",
    "salary_source",
    "interval",
    "min_amount",
    "max_amount",
    "currency",
    "is_remote",
    "job_level",
    "job_function",
    "listing_type",
    "emails",
    "description",
    "company_industry",
    "company_url",
    "company_logo",
    "company_url_direct",
    "company_addresses",
    "company_num_employees",
    "company_revenue",
    "company_description",
    # naukri-specific fields
    "skills",
    "experience_range",
    "company_rating",
    "company_reviews_count",
    "vacancy_count",
    "work_from_home_type",
]
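As a worked example of `convert_to_annual` above: an hourly range of 30–40 becomes 62,400–83,200 per year (2080 working hours assumed). A minimal sketch, assuming the function is importable from `jobspy.util` as shown in the imports elsewhere in this change set:

```python
from jobspy.util import convert_to_annual

job_data = {"interval": "hourly", "min_amount": 30, "max_amount": 40}
convert_to_annual(job_data)  # mutates the dict in place
print(job_data)  # {'interval': 'yearly', 'min_amount': 62400, 'max_amount': 83200}
```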
@ -1,10 +1,3 @@
"""
jobspy.scrapers.ziprecruiter
~~~~~~~~~~~~~~~~~~~

This module contains routines to scrape ZipRecruiter.
"""

from __future__ import annotations

import json
@ -13,33 +6,34 @@ import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Optional, Tuple, Any

from bs4 import BeautifulSoup

from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..utils import (
from jobspy.ziprecruiter.constant import headers, get_cookie_data
from jobspy.util import (
    extract_emails_from_text,
    create_session,
    markdown_converter,
    remove_attributes,
    create_logger,
)
from ...jobs import (
from jobspy.model import (
    JobPost,
    Compensation,
    Location,
    JobResponse,
    JobType,
    Country,
    DescriptionFormat,
    Scraper,
    ScraperInput,
    Site,
)
from jobspy.ziprecruiter.util import get_job_type_enum, add_params

log = create_logger("ZipRecruiter")


class ZipRecruiterScraper(Scraper):
class ZipRecruiter(Scraper):
    base_url = "https://www.ziprecruiter.com"
    api_url = "https://api.ziprecruiter.com"
@ -90,7 +84,7 @@ class ZipRecruiterScraper(Scraper):
    def _find_jobs_in_page(
        self, scraper_input: ScraperInput, continue_token: str | None = None
    ) -> Tuple[list[JobPost], Optional[str]]:
    ) -> tuple[list[JobPost], str | None]:
        """
        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
        :param scraper_input:
@ -98,7 +92,7 @@ class ZipRecruiterScraper(Scraper):
        :return: jobs found on page
        """
        jobs_list = []
        params = self._add_params(scraper_input)
        params = add_params(scraper_input)
        if continue_token:
            params["continue_from"] = continue_token
        try:
@ -151,7 +145,7 @@ class ZipRecruiterScraper(Scraper):
        location = Location(
            city=job.get("job_city"), state=job.get("job_state"), country=country_enum
        )
        job_type = self._get_job_type_enum(
        job_type = get_job_type_enum(
            job.get("employment_type", "").replace("_", "").lower()
        )
        date_posted = datetime.fromisoformat(job["posted_time"].rstrip("Z")).date()
@ -200,13 +194,17 @@ class ZipRecruiterScraper(Scraper):
            else ""
        )
        description_full = job_description_clean + company_description_clean
        script_tag = soup.find("script", type="application/json")
        if script_tag:
            job_json = json.loads(script_tag.string)
            job_url_val = job_json["model"].get("saveJobURL", "")
            m = re.search(r"job_url=(.+)", job_url_val)
            if m:
                job_url_direct = m.group(1)

        try:
            script_tag = soup.find("script", type="application/json")
            if script_tag:
                job_json = json.loads(script_tag.string)
                job_url_val = job_json["model"].get("saveJobURL", "")
                m = re.search(r"job_url=(.+)", job_url_val)
                if m:
                    job_url_direct = m.group(1)
        except:
            job_url_direct = None

        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            description_full = markdown_converter(description_full)
@ -217,51 +215,5 @@ class ZipRecruiterScraper(Scraper):
        """
        Sends a session event to the API with device properties.
        """
        data = [
            ("event_type", "session"),
            ("logged_in", "false"),
            ("number_of_retry", "1"),
            ("property", "model:iPhone"),
            ("property", "os:iOS"),
            ("property", "locale:en_us"),
            ("property", "app_build_number:4734"),
            ("property", "app_version:91.0"),
            ("property", "manufacturer:Apple"),
            ("property", "timestamp:2025-01-12T12:04:42-06:00"),
            ("property", "screen_height:852"),
            ("property", "os_version:16.6.1"),
            ("property", "source:install"),
            ("property", "screen_width:393"),
            ("property", "device_model:iPhone 14 Pro"),
            ("property", "brand:Apple"),
        ]

        url = f"{self.api_url}/jobs-app/event"
        self.session.post(url, data=data)

    @staticmethod
    def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
        for job_type in JobType:
            if job_type_str in job_type.value:
                return [job_type]
        return None

    @staticmethod
    def _add_params(scraper_input) -> dict[str, str | Any]:
        params = {
            "search": scraper_input.search_term,
            "location": scraper_input.location,
        }
        if scraper_input.hours_old:
            params["days"] = max(scraper_input.hours_old // 24, 1)
        job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
        if scraper_input.job_type:
            job_type = scraper_input.job_type
            params["employment_type"] = job_type_map.get(job_type, job_type.value[0])
        if scraper_input.easy_apply:
            params["zipapply"] = 1
        if scraper_input.is_remote:
            params["remote"] = 1
        if scraper_input.distance:
            params["radius"] = scraper_input.distance
        return {k: v for k, v in params.items() if v is not None}
        self.session.post(url, data=get_cookie_data)
@ -0,0 +1,29 @@
headers = {
    "Host": "api.ziprecruiter.com",
    "accept": "*/*",
    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
    "accept-language": "en-US,en;q=0.9",
}

get_cookie_data = [
    ("event_type", "session"),
    ("logged_in", "false"),
    ("number_of_retry", "1"),
    ("property", "model:iPhone"),
    ("property", "os:iOS"),
    ("property", "locale:en_us"),
    ("property", "app_build_number:4734"),
    ("property", "app_version:91.0"),
    ("property", "manufacturer:Apple"),
    ("property", "timestamp:2025-01-12T12:04:42-06:00"),
    ("property", "screen_height:852"),
    ("property", "os_version:16.6.1"),
    ("property", "source:install"),
    ("property", "screen_width:393"),
    ("property", "device_model:iPhone 14 Pro"),
    ("property", "brand:Apple"),
]
@ -0,0 +1,31 @@
from jobspy.model import JobType


def add_params(scraper_input) -> dict[str, str | int]:
    params: dict[str, str | int] = {
        "search": scraper_input.search_term,
        "location": scraper_input.location,
    }
    if scraper_input.hours_old:
        params["days"] = max(scraper_input.hours_old // 24, 1)

    job_type_map = {JobType.FULL_TIME: "full_time", JobType.PART_TIME: "part_time"}
    if scraper_input.job_type:
        job_type = scraper_input.job_type
        params["employment_type"] = job_type_map.get(job_type, job_type.value[0])

    if scraper_input.easy_apply:
        params["zipapply"] = 1
    if scraper_input.is_remote:
        params["remote"] = 1
    if scraper_input.distance:
        params["radius"] = scraper_input.distance

    return {k: v for k, v in params.items() if v is not None}


def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
    for job_type in JobType:
        if job_type_str in job_type.value:
            return [job_type]
    return None
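A usage sketch of `add_params` above. Building a full `ScraperInput` requires the pydantic model, so a `SimpleNamespace` stand-in with just the attributes the function reads is used here for illustration:

```python
from types import SimpleNamespace

from jobspy.model import JobType
from jobspy.ziprecruiter.util import add_params

# Stand-in for ScraperInput carrying only the fields add_params inspects.
scraper_input = SimpleNamespace(
    search_term="software engineer",
    location="San Francisco, CA",
    hours_old=72,
    job_type=JobType.FULL_TIME,
    easy_apply=None,
    is_remote=True,
    distance=25,
)

print(add_params(scraper_input))
# {'search': 'software engineer', 'location': 'San Francisco, CA', 'days': 3,
#  'employment_type': 'full_time', 'remote': 1, 'radius': 25}
```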
@ -4,15 +4,14 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "python-jobspy"
version = "1.1.76"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = [ "Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>",]
homepage = "https://github.com/Bunsly/JobSpy"
version = "1.1.80"
description = "Job scraper for LinkedIn, Indeed, Glassdoor, ZipRecruiter & Bayt"
authors = ["Cullen Watson <cullen@cullenwatson.com>", "Zachary Hampton <zachary@zacharysproducts.com>"]
homepage = "https://github.com/cullenwatson/JobSpy"
readme = "README.md"
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter",]
keywords = [ "jobs-scraper", "linkedin", "indeed", "glassdoor", "ziprecruiter", "bayt", "naukri"]

[[tool.poetry.packages]]
include = "jobspy"
from = "src"

[tool.black]
line-length = 88
@ -29,7 +28,6 @@ markdownify = "^0.13.1"
regex = "^2024.4.28"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "*"
pre-commit = "*"
@ -1,58 +0,0 @@
from __future__ import annotations

from abc import ABC, abstractmethod

from ..jobs import (
    Enum,
    BaseModel,
    JobType,
    JobResponse,
    Country,
    DescriptionFormat,
)


class Site(Enum):
    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"
    BAYT = "bayt"


class SalarySource(Enum):
    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"


class ScraperInput(BaseModel):
    site_type: list[Site]
    search_term: str | None = None
    google_search_term: str | None = None

    location: str | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
    job_type: JobType | None = None
    easy_apply: bool | None = None
    offset: int = 0
    linkedin_fetch_description: bool = False
    linkedin_company_ids: list[int] | None = None
    description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN

    results_wanted: int = 15
    hours_old: int | None = None


class Scraper(ABC):
    def __init__(
        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
    ):
        self.site = site
        self.proxies = proxies
        self.ca_cert = ca_cert

    @abstractmethod
    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
@ -1,10 +0,0 @@
headers = {
    "Host": "api.ziprecruiter.com",
    "accept": "*/*",
    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
    "accept-language": "en-US,en;q=0.9",
}