mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
338d854b96 | ||
|
|
811d4c40b4 | ||
|
|
dba92d22c2 | ||
|
|
10a3592a0f | ||
|
|
b7905cc756 | ||
|
|
6867d58829 | ||
|
|
f6248c8386 | ||
|
|
f395597fdd | ||
|
|
6372e41bd9 | ||
|
|
6c869decb8 | ||
|
|
9f4083380d | ||
|
|
9207ab56f6 | ||
|
|
757a94853e | ||
|
|
6bc191d5c7 | ||
|
|
0cc34287f7 | ||
|
|
923979093b | ||
|
|
286f0e4487 | ||
|
|
f7b29d43a2 | ||
|
|
6f1490458c | ||
|
|
6bb7d81ba8 | ||
|
|
0e046432d1 | ||
|
|
209e0e65b6 | ||
|
|
8570c0651e | ||
|
|
8678b0bbe4 |
22
.github/workflows/python-test.yml
vendored
Normal file
22
.github/workflows/python-test.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
name: Python Tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: '3.8'
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install poetry
|
||||||
|
poetry install
|
||||||
|
- name: Run tests
|
||||||
|
run: poetry run pytest tests/test_all.py
|
||||||
40
README.md
40
README.md
@@ -2,14 +2,12 @@
|
|||||||
|
|
||||||
**JobSpy** is a simple, yet comprehensive, job scraping library.
|
**JobSpy** is a simple, yet comprehensive, job scraping library.
|
||||||
|
|
||||||
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
|
|
||||||
|
|
||||||
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
|
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
|
||||||
work with us.*
|
work with us.*
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
|
- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
|
||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a Pandas DataFrame
|
||||||
- Proxies support
|
- Proxies support
|
||||||
|
|
||||||
@@ -30,14 +28,15 @@ import csv
|
|||||||
from jobspy import scrape_jobs
|
from jobspy import scrape_jobs
|
||||||
|
|
||||||
jobs = scrape_jobs(
|
jobs = scrape_jobs(
|
||||||
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
|
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
|
||||||
search_term="software engineer",
|
search_term="software engineer",
|
||||||
location="Dallas, TX",
|
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
|
||||||
|
location="San Francisco, CA",
|
||||||
results_wanted=20,
|
results_wanted=20,
|
||||||
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
|
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
|
||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA', # only needed for indeed / glassdoor
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get full description , direct job url , company industry and job level (seniority level) for linkedin (slower)
|
# linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
|
||||||
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
@@ -63,10 +62,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
|
|||||||
```plaintext
|
```plaintext
|
||||||
Optional
|
Optional
|
||||||
├── site_name (list|str):
|
├── site_name (list|str):
|
||||||
| linkedin, zip_recruiter, indeed, glassdoor
|
| linkedin, zip_recruiter, indeed, glassdoor, google
|
||||||
| (default is all four)
|
| (default is all)
|
||||||
│
|
│
|
||||||
├── search_term (str)
|
├── search_term (str)
|
||||||
|
|
|
||||||
|
├── google_search_term (str)
|
||||||
|
| search term for google jobs. This is the only param for filtering google jobs.
|
||||||
│
|
│
|
||||||
├── location (str)
|
├── location (str)
|
||||||
│
|
│
|
||||||
@@ -79,7 +81,7 @@ Optional
|
|||||||
├── proxies (list):
|
├── proxies (list):
|
||||||
| in format ['user:pass@host:port', 'localhost']
|
| in format ['user:pass@host:port', 'localhost']
|
||||||
| each job board scraper will round robin through the proxies
|
| each job board scraper will round robin through the proxies
|
||||||
│
|
|
|
||||||
├── is_remote (bool)
|
├── is_remote (bool)
|
||||||
│
|
│
|
||||||
├── results_wanted (int):
|
├── results_wanted (int):
|
||||||
@@ -113,6 +115,9 @@ Optional
|
|||||||
|
|
|
|
||||||
├── enforce_annual_salary (bool):
|
├── enforce_annual_salary (bool):
|
||||||
| converts wages to annual salary
|
| converts wages to annual salary
|
||||||
|
|
|
||||||
|
├── ca_cert (str)
|
||||||
|
| path to CA Certificate file for proxies
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -165,10 +170,7 @@ Indeed specific
|
|||||||
├── company_employees_label
|
├── company_employees_label
|
||||||
├── company_revenue_label
|
├── company_revenue_label
|
||||||
├── company_description
|
├── company_description
|
||||||
├── ceo_name
|
└── company_logo
|
||||||
├── ceo_photo_url
|
|
||||||
├── logo_photo_url
|
|
||||||
└── banner_photo_url
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Supported Countries for Job Searching
|
## Supported Countries for Job Searching
|
||||||
@@ -216,10 +218,8 @@ You can specify the following countries when searching on Indeed (use the exact
|
|||||||
## Frequently Asked Questions
|
## Frequently Asked Questions
|
||||||
|
|
||||||
---
|
---
|
||||||
|
**Q: Why is Indeed giving unrelated roles?**
|
||||||
**Q: Encountering issues with your queries?**
|
**A:** Indeed searches each of your terms separately, e.g. for software intern it searches software OR intern. Try search_term='"software intern"' (in quotes) for stricter searching.
|
||||||
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
|
|
||||||
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -230,3 +230,9 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
|
|||||||
- Try using the proxies param to change your IP address.
|
- Try using the proxies param to change your IP address.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
**Q: Encountering issues with your queries?**
|
||||||
|
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
|
||||||
|
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
|
||||||
|
|
||||||
|
---
|
||||||
|
|||||||
1901
poetry.lock
generated
1901
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
2
poetry.toml
Normal file
2
poetry.toml
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[virtualenvs]
|
||||||
|
in-project = true
|
||||||
@@ -1,10 +1,11 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.60"
|
version = "1.1.75"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
keywords = ['jobs-scraper', 'linkedin', 'indeed', 'glassdoor', 'ziprecruiter']
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
{ include = "jobspy", from = "src" }
|
{ include = "jobspy", from = "src" }
|
||||||
@@ -18,7 +19,7 @@ pandas = "^2.1.0"
|
|||||||
NUMPY = "1.26.3"
|
NUMPY = "1.26.3"
|
||||||
pydantic = "^2.3.0"
|
pydantic = "^2.3.0"
|
||||||
tls-client = "^1.0.1"
|
tls-client = "^1.0.1"
|
||||||
markdownify = "^0.11.6"
|
markdownify = "^0.13.1"
|
||||||
regex = "^2024.4.28"
|
regex = "^2024.4.28"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -5,10 +5,11 @@ from typing import Tuple
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
from .jobs import JobType, Location
|
from .jobs import JobType, Location
|
||||||
from .scrapers.utils import logger, set_logger_level, extract_salary
|
from .scrapers.utils import set_logger_level, extract_salary, create_logger
|
||||||
from .scrapers.indeed import IndeedScraper
|
from .scrapers.indeed import IndeedScraper
|
||||||
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
||||||
from .scrapers.glassdoor import GlassdoorScraper
|
from .scrapers.glassdoor import GlassdoorScraper
|
||||||
|
from .scrapers.google import GoogleJobsScraper
|
||||||
from .scrapers.linkedin import LinkedInScraper
|
from .scrapers.linkedin import LinkedInScraper
|
||||||
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
|
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
|
||||||
from .scrapers.exceptions import (
|
from .scrapers.exceptions import (
|
||||||
@@ -16,12 +17,14 @@ from .scrapers.exceptions import (
|
|||||||
IndeedException,
|
IndeedException,
|
||||||
ZipRecruiterException,
|
ZipRecruiterException,
|
||||||
GlassdoorException,
|
GlassdoorException,
|
||||||
|
GoogleJobsException,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def scrape_jobs(
|
def scrape_jobs(
|
||||||
site_name: str | list[str] | Site | list[Site] | None = None,
|
site_name: str | list[str] | Site | list[Site] | None = None,
|
||||||
search_term: str | None = None,
|
search_term: str | None = None,
|
||||||
|
google_search_term: str | None = None,
|
||||||
location: str | None = None,
|
location: str | None = None,
|
||||||
distance: int | None = 50,
|
distance: int | None = 50,
|
||||||
is_remote: bool = False,
|
is_remote: bool = False,
|
||||||
@@ -31,6 +34,7 @@ def scrape_jobs(
|
|||||||
country_indeed: str = "usa",
|
country_indeed: str = "usa",
|
||||||
hyperlinks: bool = False,
|
hyperlinks: bool = False,
|
||||||
proxies: list[str] | str | None = None,
|
proxies: list[str] | str | None = None,
|
||||||
|
ca_cert: str | None = None,
|
||||||
description_format: str = "markdown",
|
description_format: str = "markdown",
|
||||||
linkedin_fetch_description: bool | None = False,
|
linkedin_fetch_description: bool | None = False,
|
||||||
linkedin_company_ids: list[int] | None = None,
|
linkedin_company_ids: list[int] | None = None,
|
||||||
@@ -49,6 +53,7 @@ def scrape_jobs(
|
|||||||
Site.INDEED: IndeedScraper,
|
Site.INDEED: IndeedScraper,
|
||||||
Site.ZIP_RECRUITER: ZipRecruiterScraper,
|
Site.ZIP_RECRUITER: ZipRecruiterScraper,
|
||||||
Site.GLASSDOOR: GlassdoorScraper,
|
Site.GLASSDOOR: GlassdoorScraper,
|
||||||
|
Site.GOOGLE: GoogleJobsScraper,
|
||||||
}
|
}
|
||||||
set_logger_level(verbose)
|
set_logger_level(verbose)
|
||||||
|
|
||||||
@@ -82,6 +87,7 @@ def scrape_jobs(
|
|||||||
site_type=get_site_type(),
|
site_type=get_site_type(),
|
||||||
country=country_enum,
|
country=country_enum,
|
||||||
search_term=search_term,
|
search_term=search_term,
|
||||||
|
google_search_term=google_search_term,
|
||||||
location=location,
|
location=location,
|
||||||
distance=distance,
|
distance=distance,
|
||||||
is_remote=is_remote,
|
is_remote=is_remote,
|
||||||
@@ -97,11 +103,11 @@ def scrape_jobs(
|
|||||||
|
|
||||||
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
||||||
scraper_class = SCRAPER_MAPPING[site]
|
scraper_class = SCRAPER_MAPPING[site]
|
||||||
scraper = scraper_class(proxies=proxies)
|
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
|
||||||
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
scraped_data: JobResponse = scraper.scrape(scraper_input)
|
||||||
cap_name = site.value.capitalize()
|
cap_name = site.value.capitalize()
|
||||||
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
|
||||||
logger.info(f"{site_name} finished scraping")
|
create_logger(site_name).info(f"finished scraping")
|
||||||
return site.value, scraped_data
|
return site.value, scraped_data
|
||||||
|
|
||||||
site_to_jobs_dict = {}
|
site_to_jobs_dict = {}
|
||||||
@@ -189,7 +195,9 @@ def scrape_jobs(
|
|||||||
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
||||||
|
|
||||||
job_data["salary_source"] = (
|
job_data["salary_source"] = (
|
||||||
job_data["salary_source"] if job_data["min_amount"] else None
|
job_data["salary_source"]
|
||||||
|
if "min_amount" in job_data and job_data["min_amount"]
|
||||||
|
else None
|
||||||
)
|
)
|
||||||
job_df = pd.DataFrame([job_data])
|
job_df = pd.DataFrame([job_data])
|
||||||
jobs_dfs.append(job_df)
|
jobs_dfs.append(job_df)
|
||||||
@@ -210,8 +218,8 @@ def scrape_jobs(
|
|||||||
"title",
|
"title",
|
||||||
"company",
|
"company",
|
||||||
"location",
|
"location",
|
||||||
"job_type",
|
|
||||||
"date_posted",
|
"date_posted",
|
||||||
|
"job_type",
|
||||||
"salary_source",
|
"salary_source",
|
||||||
"interval",
|
"interval",
|
||||||
"min_amount",
|
"min_amount",
|
||||||
@@ -220,20 +228,17 @@ def scrape_jobs(
|
|||||||
"is_remote",
|
"is_remote",
|
||||||
"job_level",
|
"job_level",
|
||||||
"job_function",
|
"job_function",
|
||||||
"company_industry",
|
|
||||||
"listing_type",
|
"listing_type",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
|
"company_industry",
|
||||||
"company_url",
|
"company_url",
|
||||||
|
"company_logo",
|
||||||
"company_url_direct",
|
"company_url_direct",
|
||||||
"company_addresses",
|
"company_addresses",
|
||||||
"company_num_employees",
|
"company_num_employees",
|
||||||
"company_revenue",
|
"company_revenue",
|
||||||
"company_description",
|
"company_description",
|
||||||
"logo_photo_url",
|
|
||||||
"banner_photo_url",
|
|
||||||
"ceo_name",
|
|
||||||
"ceo_photo_url",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
||||||
@@ -245,6 +250,8 @@ def scrape_jobs(
|
|||||||
jobs_df = jobs_df[desired_order]
|
jobs_df = jobs_df[desired_order]
|
||||||
|
|
||||||
# Step 4: Sort the DataFrame as required
|
# Step 4: Sort the DataFrame as required
|
||||||
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
|
return jobs_df.sort_values(
|
||||||
|
by=["site", "date_posted"], ascending=[True, False]
|
||||||
|
).reset_index(drop=True)
|
||||||
else:
|
else:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|||||||
@@ -92,7 +92,8 @@ class Country(Enum):
|
|||||||
JAPAN = ("japan", "jp")
|
JAPAN = ("japan", "jp")
|
||||||
KUWAIT = ("kuwait", "kw")
|
KUWAIT = ("kuwait", "kw")
|
||||||
LUXEMBOURG = ("luxembourg", "lu")
|
LUXEMBOURG = ("luxembourg", "lu")
|
||||||
MALAYSIA = ("malaysia", "malaysia")
|
MALAYSIA = ("malaysia", "malaysia:my", "com")
|
||||||
|
MALTA = ("malta", "malta:mt", "mt")
|
||||||
MEXICO = ("mexico", "mx", "com.mx")
|
MEXICO = ("mexico", "mx", "com.mx")
|
||||||
MOROCCO = ("morocco", "ma")
|
MOROCCO = ("morocco", "ma")
|
||||||
NETHERLANDS = ("netherlands", "nl", "nl")
|
NETHERLANDS = ("netherlands", "nl", "nl")
|
||||||
@@ -117,7 +118,7 @@ class Country(Enum):
|
|||||||
SWITZERLAND = ("switzerland", "ch", "de:ch")
|
SWITZERLAND = ("switzerland", "ch", "de:ch")
|
||||||
TAIWAN = ("taiwan", "tw")
|
TAIWAN = ("taiwan", "tw")
|
||||||
THAILAND = ("thailand", "th")
|
THAILAND = ("thailand", "th")
|
||||||
TURKEY = ("turkey", "tr")
|
TURKEY = ("türkiye,turkey", "tr")
|
||||||
UKRAINE = ("ukraine", "ua")
|
UKRAINE = ("ukraine", "ua")
|
||||||
UNITEDARABEMIRATES = ("united arab emirates", "ae")
|
UNITEDARABEMIRATES = ("united arab emirates", "ae")
|
||||||
UK = ("uk,united kingdom", "uk:gb", "co.uk")
|
UK = ("uk,united kingdom", "uk:gb", "co.uk")
|
||||||
@@ -255,9 +256,7 @@ class JobPost(BaseModel):
|
|||||||
company_num_employees: str | None = None
|
company_num_employees: str | None = None
|
||||||
company_revenue: str | None = None
|
company_revenue: str | None = None
|
||||||
company_description: str | None = None
|
company_description: str | None = None
|
||||||
ceo_name: str | None = None
|
company_logo: str | None = None
|
||||||
ceo_photo_url: str | None = None
|
|
||||||
logo_photo_url: str | None = None
|
|
||||||
banner_photo_url: str | None = None
|
banner_photo_url: str | None = None
|
||||||
|
|
||||||
# linkedin only atm
|
# linkedin only atm
|
||||||
|
|||||||
@@ -17,14 +17,18 @@ class Site(Enum):
|
|||||||
INDEED = "indeed"
|
INDEED = "indeed"
|
||||||
ZIP_RECRUITER = "zip_recruiter"
|
ZIP_RECRUITER = "zip_recruiter"
|
||||||
GLASSDOOR = "glassdoor"
|
GLASSDOOR = "glassdoor"
|
||||||
|
GOOGLE = "google"
|
||||||
|
|
||||||
|
|
||||||
class SalarySource(Enum):
|
class SalarySource(Enum):
|
||||||
DIRECT_DATA = "direct_data"
|
DIRECT_DATA = "direct_data"
|
||||||
DESCRIPTION = "description"
|
DESCRIPTION = "description"
|
||||||
|
|
||||||
|
|
||||||
class ScraperInput(BaseModel):
|
class ScraperInput(BaseModel):
|
||||||
site_type: list[Site]
|
site_type: list[Site]
|
||||||
search_term: str | None = None
|
search_term: str | None = None
|
||||||
|
google_search_term: str | None = None
|
||||||
|
|
||||||
location: str | None = None
|
location: str | None = None
|
||||||
country: Country | None = Country.USA
|
country: Country | None = Country.USA
|
||||||
@@ -42,9 +46,12 @@ class ScraperInput(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class Scraper(ABC):
|
class Scraper(ABC):
|
||||||
def __init__(self, site: Site, proxies: list[str] | None = None):
|
def __init__(
|
||||||
self.proxies = proxies
|
self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
|
||||||
|
):
|
||||||
self.site = site
|
self.site = site
|
||||||
|
self.proxies = proxies
|
||||||
|
self.ca_cert = ca_cert
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
|
||||||
|
|||||||
@@ -24,3 +24,8 @@ class ZipRecruiterException(Exception):
|
|||||||
class GlassdoorException(Exception):
|
class GlassdoorException(Exception):
|
||||||
def __init__(self, message=None):
|
def __init__(self, message=None):
|
||||||
super().__init__(message or "An error occurred with Glassdoor")
|
super().__init__(message or "An error occurred with Glassdoor")
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleJobsException(Exception):
|
||||||
|
def __init__(self, message=None):
|
||||||
|
super().__init__(message or "An error occurred with Google Jobs")
|
||||||
|
|||||||
@@ -14,13 +14,13 @@ from typing import Optional, Tuple
|
|||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
from .constants import fallback_token, query_template, headers
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import extract_emails_from_text
|
from ..utils import extract_emails_from_text, create_logger
|
||||||
from ..exceptions import GlassdoorException
|
from ..exceptions import GlassdoorException
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
logger,
|
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
@@ -32,14 +32,18 @@ from ...jobs import (
|
|||||||
DescriptionFormat,
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = create_logger("Glassdoor")
|
||||||
|
|
||||||
|
|
||||||
class GlassdoorScraper(Scraper):
|
class GlassdoorScraper(Scraper):
|
||||||
def __init__(self, proxies: list[str] | str | None = None):
|
def __init__(
|
||||||
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Initializes GlassdoorScraper with the Glassdoor job search url
|
Initializes GlassdoorScraper with the Glassdoor job search url
|
||||||
"""
|
"""
|
||||||
site = Site(Site.GLASSDOOR)
|
site = Site(Site.GLASSDOOR)
|
||||||
super().__init__(site, proxies=proxies)
|
super().__init__(site, proxies=proxies, ca_cert=ca_cert)
|
||||||
|
|
||||||
self.base_url = None
|
self.base_url = None
|
||||||
self.country = None
|
self.country = None
|
||||||
@@ -59,9 +63,12 @@ class GlassdoorScraper(Scraper):
|
|||||||
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
|
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
|
||||||
self.base_url = self.scraper_input.country.get_glassdoor_url()
|
self.base_url = self.scraper_input.country.get_glassdoor_url()
|
||||||
|
|
||||||
self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
|
self.session = create_session(
|
||||||
|
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=True, has_retry=True
|
||||||
|
)
|
||||||
token = self._get_csrf_token()
|
token = self._get_csrf_token()
|
||||||
self.headers["gd-csrf-token"] = token if token else self.fallback_token
|
headers["gd-csrf-token"] = token if token else fallback_token
|
||||||
|
self.session.headers.update(headers)
|
||||||
|
|
||||||
location_id, location_type = self._get_location(
|
location_id, location_type = self._get_location(
|
||||||
scraper_input.location, scraper_input.is_remote
|
scraper_input.location, scraper_input.is_remote
|
||||||
@@ -76,7 +83,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
||||||
range_end = min(tot_pages, self.max_pages + 1)
|
range_end = min(tot_pages, self.max_pages + 1)
|
||||||
for page in range(range_start, range_end):
|
for page in range(range_start, range_end):
|
||||||
logger.info(f"Glassdoor search page: {page}")
|
logger.info(f"search page: {page} / {range_end-1}")
|
||||||
try:
|
try:
|
||||||
jobs, cursor = self._fetch_jobs_page(
|
jobs, cursor = self._fetch_jobs_page(
|
||||||
scraper_input, location_id, location_type, page, cursor
|
scraper_input, location_id, location_type, page, cursor
|
||||||
@@ -107,7 +114,6 @@ class GlassdoorScraper(Scraper):
|
|||||||
payload = self._add_payload(location_id, location_type, page_num, cursor)
|
payload = self._add_payload(location_id, location_type, page_num, cursor)
|
||||||
response = self.session.post(
|
response = self.session.post(
|
||||||
f"{self.base_url}/graph",
|
f"{self.base_url}/graph",
|
||||||
headers=self.headers,
|
|
||||||
timeout_seconds=15,
|
timeout_seconds=15,
|
||||||
data=payload,
|
data=payload,
|
||||||
)
|
)
|
||||||
@@ -148,9 +154,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
"""
|
"""
|
||||||
Fetches csrf token needed for API by visiting a generic page
|
Fetches csrf token needed for API by visiting a generic page
|
||||||
"""
|
"""
|
||||||
res = self.session.get(
|
res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm")
|
||||||
f"{self.base_url}/Job/computer-science-jobs.htm", headers=self.headers
|
|
||||||
)
|
|
||||||
pattern = r'"token":\s*"([^"]+)"'
|
pattern = r'"token":\s*"([^"]+)"'
|
||||||
matches = re.findall(pattern, res.text)
|
matches = re.findall(pattern, res.text)
|
||||||
token = None
|
token = None
|
||||||
@@ -199,7 +203,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
.lower()
|
.lower()
|
||||||
)
|
)
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job_id),
|
id=f"gd-{job_id}",
|
||||||
title=title,
|
title=title,
|
||||||
company_url=company_url if company_id else None,
|
company_url=company_url if company_id else None,
|
||||||
company_name=company_name,
|
company_name=company_name,
|
||||||
@@ -210,7 +214,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
is_remote=is_remote,
|
is_remote=is_remote,
|
||||||
description=description,
|
description=description,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
logo_photo_url=company_logo,
|
company_logo=company_logo,
|
||||||
listing_type=listing_type,
|
listing_type=listing_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -243,7 +247,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
""",
|
""",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
res = requests.post(url, json=body, headers=self.headers)
|
res = requests.post(url, json=body, headers=headers)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
return None
|
return None
|
||||||
data = res.json()[0]
|
data = res.json()[0]
|
||||||
@@ -256,7 +260,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
if not location or is_remote:
|
if not location or is_remote:
|
||||||
return "11047", "STATE" # remote options
|
return "11047", "STATE" # remote options
|
||||||
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
|
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
|
||||||
res = self.session.get(url, headers=self.headers)
|
res = self.session.get(url)
|
||||||
if res.status_code != 200:
|
if res.status_code != 200:
|
||||||
if res.status_code == 429:
|
if res.status_code == 429:
|
||||||
err = f"429 Response - Blocked by Glassdoor for too many requests"
|
err = f"429 Response - Blocked by Glassdoor for too many requests"
|
||||||
@@ -310,7 +314,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
"fromage": fromage,
|
"fromage": fromage,
|
||||||
"sort": "date",
|
"sort": "date",
|
||||||
},
|
},
|
||||||
"query": self.query_template,
|
"query": query_template,
|
||||||
}
|
}
|
||||||
if self.scraper_input.job_type:
|
if self.scraper_input.job_type:
|
||||||
payload["variables"]["filterParams"].append(
|
payload["variables"]["filterParams"].append(
|
||||||
@@ -358,188 +362,3 @@ class GlassdoorScraper(Scraper):
|
|||||||
for cursor_data in pagination_cursors:
|
for cursor_data in pagination_cursors:
|
||||||
if cursor_data["pageNumber"] == page_num:
|
if cursor_data["pageNumber"] == page_num:
|
||||||
return cursor_data["cursor"]
|
return cursor_data["cursor"]
|
||||||
|
|
||||||
fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
|
|
||||||
headers = {
|
|
||||||
"authority": "www.glassdoor.com",
|
|
||||||
"accept": "*/*",
|
|
||||||
"accept-language": "en-US,en;q=0.9",
|
|
||||||
"apollographql-client-name": "job-search-next",
|
|
||||||
"apollographql-client-version": "4.65.5",
|
|
||||||
"content-type": "application/json",
|
|
||||||
"origin": "https://www.glassdoor.com",
|
|
||||||
"referer": "https://www.glassdoor.com/",
|
|
||||||
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
|
|
||||||
"sec-ch-ua-mobile": "?0",
|
|
||||||
"sec-ch-ua-platform": '"macOS"',
|
|
||||||
"sec-fetch-dest": "empty",
|
|
||||||
"sec-fetch-mode": "cors",
|
|
||||||
"sec-fetch-site": "same-origin",
|
|
||||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
|
||||||
}
|
|
||||||
query_template = """
|
|
||||||
query JobSearchResultsQuery(
|
|
||||||
$excludeJobListingIds: [Long!],
|
|
||||||
$keyword: String,
|
|
||||||
$locationId: Int,
|
|
||||||
$locationType: LocationTypeEnum,
|
|
||||||
$numJobsToShow: Int!,
|
|
||||||
$pageCursor: String,
|
|
||||||
$pageNumber: Int,
|
|
||||||
$filterParams: [FilterParams],
|
|
||||||
$originalPageUrl: String,
|
|
||||||
$seoFriendlyUrlInput: String,
|
|
||||||
$parameterUrlInput: String,
|
|
||||||
$seoUrl: Boolean
|
|
||||||
) {
|
|
||||||
jobListings(
|
|
||||||
contextHolder: {
|
|
||||||
searchParams: {
|
|
||||||
excludeJobListingIds: $excludeJobListingIds,
|
|
||||||
keyword: $keyword,
|
|
||||||
locationId: $locationId,
|
|
||||||
locationType: $locationType,
|
|
||||||
numPerPage: $numJobsToShow,
|
|
||||||
pageCursor: $pageCursor,
|
|
||||||
pageNumber: $pageNumber,
|
|
||||||
filterParams: $filterParams,
|
|
||||||
originalPageUrl: $originalPageUrl,
|
|
||||||
seoFriendlyUrlInput: $seoFriendlyUrlInput,
|
|
||||||
parameterUrlInput: $parameterUrlInput,
|
|
||||||
seoUrl: $seoUrl,
|
|
||||||
searchType: SR
|
|
||||||
}
|
|
||||||
}
|
|
||||||
) {
|
|
||||||
companyFilterOptions {
|
|
||||||
id
|
|
||||||
shortName
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
filterOptions
|
|
||||||
indeedCtk
|
|
||||||
jobListings {
|
|
||||||
...JobView
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
jobListingSeoLinks {
|
|
||||||
linkItems {
|
|
||||||
position
|
|
||||||
url
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
jobSearchTrackingKey
|
|
||||||
jobsPageSeoData {
|
|
||||||
pageMetaDescription
|
|
||||||
pageTitle
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
paginationCursors {
|
|
||||||
cursor
|
|
||||||
pageNumber
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
indexablePageForSeo
|
|
||||||
searchResultsMetadata {
|
|
||||||
searchCriteria {
|
|
||||||
implicitLocation {
|
|
||||||
id
|
|
||||||
localizedDisplayName
|
|
||||||
type
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
keyword
|
|
||||||
location {
|
|
||||||
id
|
|
||||||
shortName
|
|
||||||
localizedShortName
|
|
||||||
localizedDisplayName
|
|
||||||
type
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
helpCenterDomain
|
|
||||||
helpCenterLocale
|
|
||||||
jobSerpJobOutlook {
|
|
||||||
occupation
|
|
||||||
paragraph
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
showMachineReadableJobs
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
totalJobsCount
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fragment JobView on JobListingSearchResult {
|
|
||||||
jobview {
|
|
||||||
header {
|
|
||||||
adOrderId
|
|
||||||
advertiserType
|
|
||||||
adOrderSponsorshipLevel
|
|
||||||
ageInDays
|
|
||||||
divisionEmployerName
|
|
||||||
easyApply
|
|
||||||
employer {
|
|
||||||
id
|
|
||||||
name
|
|
||||||
shortName
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
employerNameFromSearch
|
|
||||||
goc
|
|
||||||
gocConfidence
|
|
||||||
gocId
|
|
||||||
jobCountryId
|
|
||||||
jobLink
|
|
||||||
jobResultTrackingKey
|
|
||||||
jobTitleText
|
|
||||||
locationName
|
|
||||||
locationType
|
|
||||||
locId
|
|
||||||
needsCommission
|
|
||||||
payCurrency
|
|
||||||
payPeriod
|
|
||||||
payPeriodAdjustedPay {
|
|
||||||
p10
|
|
||||||
p50
|
|
||||||
p90
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
rating
|
|
||||||
salarySource
|
|
||||||
savedJobId
|
|
||||||
sponsored
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
job {
|
|
||||||
description
|
|
||||||
importConfigId
|
|
||||||
jobTitleId
|
|
||||||
jobTitleText
|
|
||||||
listingId
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
jobListingAdminDetails {
|
|
||||||
cpcVal
|
|
||||||
importConfigId
|
|
||||||
jobListingId
|
|
||||||
jobSourceId
|
|
||||||
userEligibleForAdminJobDetails
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
overview {
|
|
||||||
shortName
|
|
||||||
squareLogoUrl
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
__typename
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|||||||
184
src/jobspy/scrapers/glassdoor/constants.py
Normal file
184
src/jobspy/scrapers/glassdoor/constants.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
headers = {
|
||||||
|
"authority": "www.glassdoor.com",
|
||||||
|
"accept": "*/*",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"apollographql-client-name": "job-search-next",
|
||||||
|
"apollographql-client-version": "4.65.5",
|
||||||
|
"content-type": "application/json",
|
||||||
|
"origin": "https://www.glassdoor.com",
|
||||||
|
"referer": "https://www.glassdoor.com/",
|
||||||
|
"sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
|
||||||
|
"sec-ch-ua-mobile": "?0",
|
||||||
|
"sec-ch-ua-platform": '"macOS"',
|
||||||
|
"sec-fetch-dest": "empty",
|
||||||
|
"sec-fetch-mode": "cors",
|
||||||
|
"sec-fetch-site": "same-origin",
|
||||||
|
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
||||||
|
}
|
||||||
|
query_template = """
|
||||||
|
query JobSearchResultsQuery(
|
||||||
|
$excludeJobListingIds: [Long!],
|
||||||
|
$keyword: String,
|
||||||
|
$locationId: Int,
|
||||||
|
$locationType: LocationTypeEnum,
|
||||||
|
$numJobsToShow: Int!,
|
||||||
|
$pageCursor: String,
|
||||||
|
$pageNumber: Int,
|
||||||
|
$filterParams: [FilterParams],
|
||||||
|
$originalPageUrl: String,
|
||||||
|
$seoFriendlyUrlInput: String,
|
||||||
|
$parameterUrlInput: String,
|
||||||
|
$seoUrl: Boolean
|
||||||
|
) {
|
||||||
|
jobListings(
|
||||||
|
contextHolder: {
|
||||||
|
searchParams: {
|
||||||
|
excludeJobListingIds: $excludeJobListingIds,
|
||||||
|
keyword: $keyword,
|
||||||
|
locationId: $locationId,
|
||||||
|
locationType: $locationType,
|
||||||
|
numPerPage: $numJobsToShow,
|
||||||
|
pageCursor: $pageCursor,
|
||||||
|
pageNumber: $pageNumber,
|
||||||
|
filterParams: $filterParams,
|
||||||
|
originalPageUrl: $originalPageUrl,
|
||||||
|
seoFriendlyUrlInput: $seoFriendlyUrlInput,
|
||||||
|
parameterUrlInput: $parameterUrlInput,
|
||||||
|
seoUrl: $seoUrl,
|
||||||
|
searchType: SR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
) {
|
||||||
|
companyFilterOptions {
|
||||||
|
id
|
||||||
|
shortName
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
filterOptions
|
||||||
|
indeedCtk
|
||||||
|
jobListings {
|
||||||
|
...JobView
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
jobListingSeoLinks {
|
||||||
|
linkItems {
|
||||||
|
position
|
||||||
|
url
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
jobSearchTrackingKey
|
||||||
|
jobsPageSeoData {
|
||||||
|
pageMetaDescription
|
||||||
|
pageTitle
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
paginationCursors {
|
||||||
|
cursor
|
||||||
|
pageNumber
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
indexablePageForSeo
|
||||||
|
searchResultsMetadata {
|
||||||
|
searchCriteria {
|
||||||
|
implicitLocation {
|
||||||
|
id
|
||||||
|
localizedDisplayName
|
||||||
|
type
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
keyword
|
||||||
|
location {
|
||||||
|
id
|
||||||
|
shortName
|
||||||
|
localizedShortName
|
||||||
|
localizedDisplayName
|
||||||
|
type
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
helpCenterDomain
|
||||||
|
helpCenterLocale
|
||||||
|
jobSerpJobOutlook {
|
||||||
|
occupation
|
||||||
|
paragraph
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
showMachineReadableJobs
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
totalJobsCount
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fragment JobView on JobListingSearchResult {
|
||||||
|
jobview {
|
||||||
|
header {
|
||||||
|
adOrderId
|
||||||
|
advertiserType
|
||||||
|
adOrderSponsorshipLevel
|
||||||
|
ageInDays
|
||||||
|
divisionEmployerName
|
||||||
|
easyApply
|
||||||
|
employer {
|
||||||
|
id
|
||||||
|
name
|
||||||
|
shortName
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
employerNameFromSearch
|
||||||
|
goc
|
||||||
|
gocConfidence
|
||||||
|
gocId
|
||||||
|
jobCountryId
|
||||||
|
jobLink
|
||||||
|
jobResultTrackingKey
|
||||||
|
jobTitleText
|
||||||
|
locationName
|
||||||
|
locationType
|
||||||
|
locId
|
||||||
|
needsCommission
|
||||||
|
payCurrency
|
||||||
|
payPeriod
|
||||||
|
payPeriodAdjustedPay {
|
||||||
|
p10
|
||||||
|
p50
|
||||||
|
p90
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
rating
|
||||||
|
salarySource
|
||||||
|
savedJobId
|
||||||
|
sponsored
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
job {
|
||||||
|
description
|
||||||
|
importConfigId
|
||||||
|
jobTitleId
|
||||||
|
jobTitleText
|
||||||
|
listingId
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
jobListingAdminDetails {
|
||||||
|
cpcVal
|
||||||
|
importConfigId
|
||||||
|
jobListingId
|
||||||
|
jobSourceId
|
||||||
|
userEligibleForAdminJobDetails
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
overview {
|
||||||
|
shortName
|
||||||
|
squareLogoUrl
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
__typename
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
fallback_token = "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok"
|
||||||
250
src/jobspy/scrapers/google/__init__.py
Normal file
250
src/jobspy/scrapers/google/__init__.py
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
"""
|
||||||
|
jobspy.scrapers.google
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module contains routines to scrape Google.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from typing import Tuple
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from .constants import headers_jobs, headers_initial, async_param
|
||||||
|
from .. import Scraper, ScraperInput, Site
|
||||||
|
from ..utils import extract_emails_from_text, create_logger, extract_job_type
|
||||||
|
from ..utils import (
|
||||||
|
create_session,
|
||||||
|
)
|
||||||
|
from ...jobs import (
|
||||||
|
JobPost,
|
||||||
|
JobResponse,
|
||||||
|
Location,
|
||||||
|
JobType,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = create_logger("Google")
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleJobsScraper(Scraper):
    """
    Scraper for Google job search results.

    The first page is fetched as regular search HTML from ``self.url``; further
    pages are fetched from the async callback endpoint ``self.jobs_url`` using
    the forward cursor (``data-async-fc``) found in the previous response.
    """

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes Google Scraper with the Google jobs search url

        :param proxies: proxy or list of proxies forwarded to the base Scraper.
        :param ca_cert: CA certificate path forwarded to the base Scraper.
        """
        site = Site(Site.GOOGLE)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10
        # URLs of every job parsed so far; used for de-duplication and as the
        # progress counter of the pagination loop in scrape().
        self.seen_urls = set()
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # Hard cap on the number of results requested from Google.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
        if forward_cursor is None:
            logger.warning(
                "initial cursor not found, try changing your query or there was at most 10 results"
            )
            return JobResponse(jobs=job_list)

        page = 1

        while (
            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
            and forward_cursor
        ):
            logger.info(
                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
            )
            try:
                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            except Exception as e:
                # Best effort: keep what was collected so far and stop paging.
                logger.error(f"failed to get jobs on page: {page}, {e}")
                break
            if not jobs:
                logger.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1
        return JobResponse(
            jobs=job_list[
                scraper_input.offset : scraper_input.offset
                + scraper_input.results_wanted
            ]
        )

    def _get_initial_cursor_and_jobs(self) -> Tuple[str | None, list[JobPost]]:
        """
        Gets initial cursor and jobs to paginate through job listings

        :return: (forward cursor or None when the page has none, jobs parsed
                 from the initial search page).
        """
        query = f"{self.scraper_input.search_term} jobs"

        def get_time_range(hours_old):
            # Map the age limit onto the phrases appended to the search query.
            if hours_old <= 24:
                return "since yesterday"
            elif hours_old <= 72:
                return "in the last 3 days"
            elif hours_old <= 168:
                return "in the last week"
            else:
                return "in the last month"

        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }

        if self.scraper_input.job_type in job_type_mapping:
            query += f" {job_type_mapping[self.scraper_input.job_type]}"

        if self.scraper_input.location:
            query += f" near {self.scraper_input.location}"

        if self.scraper_input.hours_old:
            time_filter = get_time_range(self.scraper_input.hours_old)
            query += f" {time_filter}"

        if self.scraper_input.is_remote:
            query += " remote"

        # An explicit google_search_term replaces the generated query entirely.
        if self.scraper_input.google_search_term:
            query = self.scraper_input.google_search_term

        params = {"q": query, "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, response.text)
        data_async_fc = match_fc.group(1) if match_fc else None
        jobs_raw = self._find_job_info_initial_page(response.text)
        jobs = []
        for job_raw in jobs_raw:
            job_post = self._parse_job(job_raw)
            if job_post:
                jobs.append(job_post)
        return data_async_fc, jobs

    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
        """
        Fetches the page identified by forward_cursor from the async endpoint.

        :return: (jobs on that page, next forward cursor or None).
        """
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
        """
        Parses jobs on a page with next page cursor

        :param job_data: raw response text of the async endpoint.
        :return: (jobs on the page, next forward cursor or None).
        :raises ValueError: when the ``[[[ ... ]]]`` payload is missing or is
                            not valid JSON (json.JSONDecodeError is a subclass).
        """
        # The JSON payload is embedded in surrounding text; cut it out.
        start_idx = job_data.index("[[[")
        end_idx = job_data.rindex("]]]") + 3
        parsed = json.loads(job_data[start_idx:end_idx])[0]

        pattern_fc = r'data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, job_data)
        data_async_fc = match_fc.group(1) if match_fc else None
        jobs_on_page = []
        for array in parsed:
            _, payload = array
            # Only entries whose payload is itself an encoded nested array are
            # decoded and searched for a job.
            if not payload.startswith("[[["):
                continue
            job_d = json.loads(payload)

            job_info = self._find_job_info(job_d)
            if job_info is None:
                # No job listing inside this entry; skip it instead of letting
                # _parse_job fail on None and abort the whole page.
                continue
            job_post = self._parse_job(job_info)
            if job_post:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list) -> JobPost | None:
        """
        Builds a JobPost from one raw job array.

        Fields are read by position: 0 title, 1 company, 2 location,
        3 nested url, 12 "posted ... ago" text, 19 description, 28 id.

        :return: the JobPost, or None when its URL was already seen.
        """
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return
        self.seen_urls.add(job_url)

        title = job_info[0]
        company_name = job_info[1]
        location = city = job_info[2]
        state = country = date_posted = None
        if location and "," in location:
            city, state, *country = [*map(lambda x: x.strip(), location.split(","))]

        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            match = re.search(r"\d+", days_ago_str)
            # Only derive a date when the text actually contains a number;
            # timedelta(days=None) would raise TypeError.
            # NOTE(review): the number is treated as days whatever unit the
            # text uses - confirm against real responses.
            if match:
                date_posted = (
                    datetime.now() - timedelta(days=int(match.group()))
                ).date()

        description = job_info[19]
        description_lower = description.lower()

        job_post = JobPost(
            id=f"go-{job_info[28]}",
            title=title,
            company_name=company_name,
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            date_posted=date_posted,
            is_remote="remote" in description_lower or "wfh" in description_lower,
            description=description,
            emails=extract_emails_from_text(description),
            job_type=extract_job_type(description),
        )
        return job_post

    @staticmethod
    def _find_job_info(jobs_data: list | dict) -> list | None:
        """
        Iterates through the JSON data to find the job listings

        Depth-first search for the first dict entry keyed "520084652" whose
        value is a list.

        :return: that list, or None when nothing matches.
        """
        if isinstance(jobs_data, dict):
            for key, value in jobs_data.items():
                if key == "520084652" and isinstance(value, list):
                    return value
                else:
                    result = GoogleJobsScraper._find_job_info(value)
                    if result:
                        return result
        elif isinstance(jobs_data, list):
            for item in jobs_data:
                result = GoogleJobsScraper._find_job_info(item)
                if result:
                    return result
        return None

    @staticmethod
    def _find_job_info_initial_page(html_text: str):
        """
        Extracts the raw job arrays embedded in the initial search page HTML.

        :param html_text: HTML of the initial search response.
        :return: list of decoded job arrays; matches that are not valid JSON
                 are logged and skipped.
        """
        # Matches the array following the 520084652 key, allowing brackets
        # nested up to four levels deep.
        pattern = (
            '520084652":('
            + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
        )
        results = []
        for match in re.finditer(pattern, html_text):
            try:
                results.append(json.loads(match.group(1)))
            except json.JSONDecodeError as e:
                # Callers index every result as a job array, so an error
                # placeholder would crash them; drop the bad match instead.
                logger.error(f"Failed to parse match: {str(e)}")
        return results
|
||||||
52
src/jobspy/scrapers/google/constants.py
Normal file
52
src/jobspy/scrapers/google/constants.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
headers_initial = {
|
||||||
|
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"priority": "u=0, i",
|
||||||
|
"referer": "https://www.google.com/",
|
||||||
|
"sec-ch-prefers-color-scheme": "dark",
|
||||||
|
"sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
|
||||||
|
"sec-ch-ua-arch": '"arm"',
|
||||||
|
"sec-ch-ua-bitness": '"64"',
|
||||||
|
"sec-ch-ua-form-factors": '"Desktop"',
|
||||||
|
"sec-ch-ua-full-version": '"130.0.6723.58"',
|
||||||
|
"sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
|
||||||
|
"sec-ch-ua-mobile": "?0",
|
||||||
|
"sec-ch-ua-model": '""',
|
||||||
|
"sec-ch-ua-platform": '"macOS"',
|
||||||
|
"sec-ch-ua-platform-version": '"15.0.1"',
|
||||||
|
"sec-ch-ua-wow64": "?0",
|
||||||
|
"sec-fetch-dest": "document",
|
||||||
|
"sec-fetch-mode": "navigate",
|
||||||
|
"sec-fetch-site": "same-origin",
|
||||||
|
"sec-fetch-user": "?1",
|
||||||
|
"upgrade-insecure-requests": "1",
|
||||||
|
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
||||||
|
"x-browser-channel": "stable",
|
||||||
|
"x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
|
||||||
|
"x-browser-year": "2024",
|
||||||
|
}
|
||||||
|
|
||||||
|
headers_jobs = {
|
||||||
|
"accept": "*/*",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"priority": "u=1, i",
|
||||||
|
"referer": "https://www.google.com/",
|
||||||
|
"sec-ch-prefers-color-scheme": "dark",
|
||||||
|
"sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
|
||||||
|
"sec-ch-ua-arch": '"arm"',
|
||||||
|
"sec-ch-ua-bitness": '"64"',
|
||||||
|
"sec-ch-ua-form-factors": '"Desktop"',
|
||||||
|
"sec-ch-ua-full-version": '"130.0.6723.58"',
|
||||||
|
"sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
|
||||||
|
"sec-ch-ua-mobile": "?0",
|
||||||
|
"sec-ch-ua-model": '""',
|
||||||
|
"sec-ch-ua-platform": '"macOS"',
|
||||||
|
"sec-ch-ua-platform-version": '"15.0.1"',
|
||||||
|
"sec-ch-ua-wow64": "?0",
|
||||||
|
"sec-fetch-dest": "empty",
|
||||||
|
"sec-fetch-mode": "cors",
|
||||||
|
"sec-fetch-site": "same-origin",
|
||||||
|
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
||||||
|
}
|
||||||
|
|
||||||
|
async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"
|
||||||
@@ -10,15 +10,15 @@ from __future__ import annotations
|
|||||||
import math
|
import math
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from concurrent.futures import ThreadPoolExecutor, Future
|
|
||||||
|
|
||||||
|
from .constants import job_search_query, api_headers
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
get_enum_from_job_type,
|
get_enum_from_job_type,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
logger,
|
|
||||||
create_session,
|
create_session,
|
||||||
|
create_logger,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
@@ -30,15 +30,21 @@ from ...jobs import (
|
|||||||
DescriptionFormat,
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = create_logger("Indeed")
|
||||||
|
|
||||||
|
|
||||||
class IndeedScraper(Scraper):
|
class IndeedScraper(Scraper):
|
||||||
def __init__(self, proxies: list[str] | str | None = None):
|
def __init__(
|
||||||
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Initializes IndeedScraper with the Indeed API url
|
Initializes IndeedScraper with the Indeed API url
|
||||||
"""
|
"""
|
||||||
super().__init__(Site.INDEED, proxies=proxies)
|
super().__init__(Site.INDEED, proxies=proxies)
|
||||||
|
|
||||||
self.session = create_session(proxies=self.proxies, is_tls=False)
|
self.session = create_session(
|
||||||
|
proxies=self.proxies, ca_cert=ca_cert, is_tls=False
|
||||||
|
)
|
||||||
self.scraper_input = None
|
self.scraper_input = None
|
||||||
self.jobs_per_page = 100
|
self.jobs_per_page = 100
|
||||||
self.num_workers = 10
|
self.num_workers = 10
|
||||||
@@ -57,29 +63,29 @@ class IndeedScraper(Scraper):
|
|||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
|
domain, self.api_country_code = self.scraper_input.country.indeed_domain_value
|
||||||
self.base_url = f"https://{domain}.indeed.com"
|
self.base_url = f"https://{domain}.indeed.com"
|
||||||
self.headers = self.api_headers.copy()
|
self.headers = api_headers.copy()
|
||||||
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
|
self.headers["indeed-co"] = self.scraper_input.country.indeed_domain_value
|
||||||
job_list = []
|
job_list = []
|
||||||
page = 1
|
page = 1
|
||||||
|
|
||||||
cursor = None
|
cursor = None
|
||||||
offset_pages = math.ceil(self.scraper_input.offset / 100)
|
|
||||||
for _ in range(offset_pages):
|
|
||||||
logger.info(f"Indeed skipping search page: {page}")
|
|
||||||
__, cursor = self._scrape_page(cursor)
|
|
||||||
if not __:
|
|
||||||
logger.info(f"Indeed found no jobs on page: {page}")
|
|
||||||
break
|
|
||||||
|
|
||||||
while len(self.seen_urls) < scraper_input.results_wanted:
|
while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
|
||||||
logger.info(f"Indeed search page: {page}")
|
logger.info(
|
||||||
|
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
|
||||||
|
)
|
||||||
jobs, cursor = self._scrape_page(cursor)
|
jobs, cursor = self._scrape_page(cursor)
|
||||||
if not jobs:
|
if not jobs:
|
||||||
logger.info(f"Indeed found no jobs on page: {page}")
|
logger.info(f"found no jobs on page: {page}")
|
||||||
break
|
break
|
||||||
job_list += jobs
|
job_list += jobs
|
||||||
page += 1
|
page += 1
|
||||||
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
|
return JobResponse(
|
||||||
|
jobs=job_list[
|
||||||
|
scraper_input.offset : scraper_input.offset
|
||||||
|
+ scraper_input.results_wanted
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
|
def _scrape_page(self, cursor: str | None) -> Tuple[list[JobPost], str | None]:
|
||||||
"""
|
"""
|
||||||
@@ -95,7 +101,7 @@ class IndeedScraper(Scraper):
|
|||||||
if self.scraper_input.search_term
|
if self.scraper_input.search_term
|
||||||
else ""
|
else ""
|
||||||
)
|
)
|
||||||
query = self.job_search_query.format(
|
query = job_search_query.format(
|
||||||
what=(f'what: "{search_term}"' if search_term else ""),
|
what=(f'what: "{search_term}"' if search_term else ""),
|
||||||
location=(
|
location=(
|
||||||
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
|
f'location: {{where: "{self.scraper_input.location}", radius: {self.scraper_input.distance}, radiusUnit: MILES}}'
|
||||||
@@ -109,28 +115,29 @@ class IndeedScraper(Scraper):
|
|||||||
payload = {
|
payload = {
|
||||||
"query": query,
|
"query": query,
|
||||||
}
|
}
|
||||||
api_headers = self.api_headers.copy()
|
api_headers_temp = api_headers.copy()
|
||||||
api_headers["indeed-co"] = self.api_country_code
|
api_headers_temp["indeed-co"] = self.api_country_code
|
||||||
response = self.session.post(
|
response = self.session.post(
|
||||||
self.api_url,
|
self.api_url,
|
||||||
headers=api_headers,
|
headers=api_headers_temp,
|
||||||
json=payload,
|
json=payload,
|
||||||
timeout=10,
|
timeout=10,
|
||||||
)
|
)
|
||||||
if response.status_code != 200:
|
if not response.ok:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Indeed responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
|
f"responded with status code: {response.status_code} (submit GitHub issue if this appears to be a bug)"
|
||||||
)
|
)
|
||||||
return jobs, new_cursor
|
return jobs, new_cursor
|
||||||
data = response.json()
|
data = response.json()
|
||||||
jobs = data["data"]["jobSearch"]["results"]
|
jobs = data["data"]["jobSearch"]["results"]
|
||||||
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
|
new_cursor = data["data"]["jobSearch"]["pageInfo"]["nextCursor"]
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
job_list = []
|
||||||
job_results: list[Future] = [
|
for job in jobs:
|
||||||
executor.submit(self._process_job, job["job"]) for job in jobs
|
processed_job = self._process_job(job["job"])
|
||||||
]
|
if processed_job:
|
||||||
job_list = [result.result() for result in job_results if result.result()]
|
job_list.append(processed_job)
|
||||||
|
|
||||||
return job_list, new_cursor
|
return job_list, new_cursor
|
||||||
|
|
||||||
def _build_filters(self):
|
def _build_filters(self):
|
||||||
@@ -212,7 +219,7 @@ class IndeedScraper(Scraper):
|
|||||||
employer_details = employer.get("employerDetails", {}) if employer else {}
|
employer_details = employer.get("employerDetails", {}) if employer else {}
|
||||||
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
|
rel_url = job["employer"]["relativeCompanyPageUrl"] if job["employer"] else None
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job["key"]),
|
id=f'in-{job["key"]}',
|
||||||
title=job["title"],
|
title=job["title"],
|
||||||
description=description,
|
description=description,
|
||||||
company_name=job["employer"].get("name") if job.get("employer") else None,
|
company_name=job["employer"].get("name") if job.get("employer") else None,
|
||||||
@@ -251,18 +258,11 @@ class IndeedScraper(Scraper):
|
|||||||
company_num_employees=employer_details.get("employeesLocalizedLabel"),
|
company_num_employees=employer_details.get("employeesLocalizedLabel"),
|
||||||
company_revenue=employer_details.get("revenueLocalizedLabel"),
|
company_revenue=employer_details.get("revenueLocalizedLabel"),
|
||||||
company_description=employer_details.get("briefDescription"),
|
company_description=employer_details.get("briefDescription"),
|
||||||
ceo_name=employer_details.get("ceoName"),
|
company_logo=(
|
||||||
ceo_photo_url=employer_details.get("ceoPhotoUrl"),
|
|
||||||
logo_photo_url=(
|
|
||||||
employer["images"].get("squareLogoUrl")
|
employer["images"].get("squareLogoUrl")
|
||||||
if employer and employer.get("images")
|
if employer and employer.get("images")
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
banner_photo_url=(
|
|
||||||
employer["images"].get("headerImageUrl")
|
|
||||||
if employer and employer.get("images")
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -347,112 +347,3 @@ class IndeedScraper(Scraper):
|
|||||||
return CompensationInterval[mapped_interval]
|
return CompensationInterval[mapped_interval]
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported interval: {interval}")
|
raise ValueError(f"Unsupported interval: {interval}")
|
||||||
|
|
||||||
api_headers = {
|
|
||||||
"Host": "apis.indeed.com",
|
|
||||||
"content-type": "application/json",
|
|
||||||
"indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
|
|
||||||
"accept": "application/json",
|
|
||||||
"indeed-locale": "en-US",
|
|
||||||
"accept-language": "en-US,en;q=0.9",
|
|
||||||
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
|
|
||||||
"indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
|
|
||||||
}
|
|
||||||
job_search_query = """
|
|
||||||
query GetJobData {{
|
|
||||||
jobSearch(
|
|
||||||
{what}
|
|
||||||
{location}
|
|
||||||
limit: 100
|
|
||||||
sort: DATE
|
|
||||||
{cursor}
|
|
||||||
{filters}
|
|
||||||
) {{
|
|
||||||
pageInfo {{
|
|
||||||
nextCursor
|
|
||||||
}}
|
|
||||||
results {{
|
|
||||||
trackingKey
|
|
||||||
job {{
|
|
||||||
source {{
|
|
||||||
name
|
|
||||||
}}
|
|
||||||
key
|
|
||||||
title
|
|
||||||
datePublished
|
|
||||||
dateOnIndeed
|
|
||||||
description {{
|
|
||||||
html
|
|
||||||
}}
|
|
||||||
location {{
|
|
||||||
countryName
|
|
||||||
countryCode
|
|
||||||
admin1Code
|
|
||||||
city
|
|
||||||
postalCode
|
|
||||||
streetAddress
|
|
||||||
formatted {{
|
|
||||||
short
|
|
||||||
long
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
compensation {{
|
|
||||||
estimated {{
|
|
||||||
currencyCode
|
|
||||||
baseSalary {{
|
|
||||||
unitOfWork
|
|
||||||
range {{
|
|
||||||
... on Range {{
|
|
||||||
min
|
|
||||||
max
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
baseSalary {{
|
|
||||||
unitOfWork
|
|
||||||
range {{
|
|
||||||
... on Range {{
|
|
||||||
min
|
|
||||||
max
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
currencyCode
|
|
||||||
}}
|
|
||||||
attributes {{
|
|
||||||
key
|
|
||||||
label
|
|
||||||
}}
|
|
||||||
employer {{
|
|
||||||
relativeCompanyPageUrl
|
|
||||||
name
|
|
||||||
dossier {{
|
|
||||||
employerDetails {{
|
|
||||||
addresses
|
|
||||||
industry
|
|
||||||
employeesLocalizedLabel
|
|
||||||
revenueLocalizedLabel
|
|
||||||
briefDescription
|
|
||||||
ceoName
|
|
||||||
ceoPhotoUrl
|
|
||||||
}}
|
|
||||||
images {{
|
|
||||||
headerImageUrl
|
|
||||||
squareLogoUrl
|
|
||||||
}}
|
|
||||||
links {{
|
|
||||||
corporateWebsite
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
recruit {{
|
|
||||||
viewJobUrl
|
|
||||||
detailedSalary
|
|
||||||
workSchedule
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
"""
|
|
||||||
|
|||||||
109
src/jobspy/scrapers/indeed/constants.py
Normal file
109
src/jobspy/scrapers/indeed/constants.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
job_search_query = """
|
||||||
|
query GetJobData {{
|
||||||
|
jobSearch(
|
||||||
|
{what}
|
||||||
|
{location}
|
||||||
|
limit: 100
|
||||||
|
{cursor}
|
||||||
|
sort: RELEVANCE
|
||||||
|
{filters}
|
||||||
|
) {{
|
||||||
|
pageInfo {{
|
||||||
|
nextCursor
|
||||||
|
}}
|
||||||
|
results {{
|
||||||
|
trackingKey
|
||||||
|
job {{
|
||||||
|
source {{
|
||||||
|
name
|
||||||
|
}}
|
||||||
|
key
|
||||||
|
title
|
||||||
|
datePublished
|
||||||
|
dateOnIndeed
|
||||||
|
description {{
|
||||||
|
html
|
||||||
|
}}
|
||||||
|
location {{
|
||||||
|
countryName
|
||||||
|
countryCode
|
||||||
|
admin1Code
|
||||||
|
city
|
||||||
|
postalCode
|
||||||
|
streetAddress
|
||||||
|
formatted {{
|
||||||
|
short
|
||||||
|
long
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
compensation {{
|
||||||
|
estimated {{
|
||||||
|
currencyCode
|
||||||
|
baseSalary {{
|
||||||
|
unitOfWork
|
||||||
|
range {{
|
||||||
|
... on Range {{
|
||||||
|
min
|
||||||
|
max
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
baseSalary {{
|
||||||
|
unitOfWork
|
||||||
|
range {{
|
||||||
|
... on Range {{
|
||||||
|
min
|
||||||
|
max
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
currencyCode
|
||||||
|
}}
|
||||||
|
attributes {{
|
||||||
|
key
|
||||||
|
label
|
||||||
|
}}
|
||||||
|
employer {{
|
||||||
|
relativeCompanyPageUrl
|
||||||
|
name
|
||||||
|
dossier {{
|
||||||
|
employerDetails {{
|
||||||
|
addresses
|
||||||
|
industry
|
||||||
|
employeesLocalizedLabel
|
||||||
|
revenueLocalizedLabel
|
||||||
|
briefDescription
|
||||||
|
ceoName
|
||||||
|
ceoPhotoUrl
|
||||||
|
}}
|
||||||
|
images {{
|
||||||
|
headerImageUrl
|
||||||
|
squareLogoUrl
|
||||||
|
}}
|
||||||
|
links {{
|
||||||
|
corporateWebsite
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
recruit {{
|
||||||
|
viewJobUrl
|
||||||
|
detailedSalary
|
||||||
|
workSchedule
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
api_headers = {
|
||||||
|
"Host": "apis.indeed.com",
|
||||||
|
"content-type": "application/json",
|
||||||
|
"indeed-api-key": "161092c2017b5bbab13edb12461a62d5a833871e7cad6d9d475304573de67ac8",
|
||||||
|
"accept": "application/json",
|
||||||
|
"indeed-locale": "en-US",
|
||||||
|
"accept-language": "en-US,en;q=0.9",
|
||||||
|
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 193.1",
|
||||||
|
"indeed-app-info": "appv=193.1; appid=com.indeed.jobsearch; osv=16.6.1; os=ios; dtype=phone",
|
||||||
|
}
|
||||||
@@ -7,6 +7,7 @@ This module contains routines to scrape LinkedIn.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
import regex as re
|
import regex as re
|
||||||
@@ -17,9 +18,10 @@ from bs4.element import Tag
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urlparse, urlunparse, unquote
|
from urllib.parse import urlparse, urlunparse, unquote
|
||||||
|
|
||||||
|
from .constants import headers
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ..utils import create_session, remove_attributes
|
from ..utils import create_session, remove_attributes, create_logger
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Location,
|
Location,
|
||||||
@@ -30,13 +32,14 @@ from ...jobs import (
|
|||||||
DescriptionFormat,
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
logger,
|
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
get_enum_from_job_type,
|
get_enum_from_job_type,
|
||||||
currency_parser,
|
currency_parser,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = create_logger("LinkedIn")
|
||||||
|
|
||||||
|
|
||||||
class LinkedInScraper(Scraper):
|
class LinkedInScraper(Scraper):
|
||||||
base_url = "https://www.linkedin.com"
|
base_url = "https://www.linkedin.com"
|
||||||
@@ -44,19 +47,22 @@ class LinkedInScraper(Scraper):
|
|||||||
band_delay = 4
|
band_delay = 4
|
||||||
jobs_per_page = 25
|
jobs_per_page = 25
|
||||||
|
|
||||||
def __init__(self, proxies: list[str] | str | None = None):
|
def __init__(
|
||||||
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Initializes LinkedInScraper with the LinkedIn job search url
|
Initializes LinkedInScraper with the LinkedIn job search url
|
||||||
"""
|
"""
|
||||||
super().__init__(Site.LINKEDIN, proxies=proxies)
|
super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)
|
||||||
self.session = create_session(
|
self.session = create_session(
|
||||||
proxies=self.proxies,
|
proxies=self.proxies,
|
||||||
|
ca_cert=ca_cert,
|
||||||
is_tls=False,
|
is_tls=False,
|
||||||
has_retry=True,
|
has_retry=True,
|
||||||
delay=5,
|
delay=5,
|
||||||
clear_cookies=True,
|
clear_cookies=True,
|
||||||
)
|
)
|
||||||
self.session.headers.update(self.headers)
|
self.session.headers.update(headers)
|
||||||
self.scraper_input = None
|
self.scraper_input = None
|
||||||
self.country = "worldwide"
|
self.country = "worldwide"
|
||||||
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
|
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
|
||||||
@@ -70,17 +76,19 @@ class LinkedInScraper(Scraper):
|
|||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
job_list: list[JobPost] = []
|
job_list: list[JobPost] = []
|
||||||
seen_ids = set()
|
seen_ids = set()
|
||||||
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
|
start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
|
||||||
request_count = 0
|
request_count = 0
|
||||||
seconds_old = (
|
seconds_old = (
|
||||||
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
||||||
)
|
)
|
||||||
continue_search = (
|
continue_search = (
|
||||||
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
lambda: len(job_list) < scraper_input.results_wanted and start < 1000
|
||||||
)
|
)
|
||||||
while continue_search():
|
while continue_search():
|
||||||
request_count += 1
|
request_count += 1
|
||||||
logger.info(f"LinkedIn search page: {request_count}")
|
logger.info(
|
||||||
|
f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
|
||||||
|
)
|
||||||
params = {
|
params = {
|
||||||
"keywords": scraper_input.search_term,
|
"keywords": scraper_input.search_term,
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
@@ -92,7 +100,7 @@ class LinkedInScraper(Scraper):
|
|||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
"pageNum": 0,
|
"pageNum": 0,
|
||||||
"start": page,
|
"start": start,
|
||||||
"f_AL": "true" if scraper_input.easy_apply else None,
|
"f_AL": "true" if scraper_input.easy_apply else None,
|
||||||
"f_C": (
|
"f_C": (
|
||||||
",".join(map(str, scraper_input.linkedin_company_ids))
|
",".join(map(str, scraper_input.linkedin_company_ids))
|
||||||
@@ -154,7 +162,7 @@ class LinkedInScraper(Scraper):
|
|||||||
|
|
||||||
if continue_search():
|
if continue_search():
|
||||||
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
||||||
page += len(job_list)
|
start += len(job_list)
|
||||||
|
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
@@ -210,7 +218,7 @@ class LinkedInScraper(Scraper):
|
|||||||
job_details = self._get_job_details(job_id)
|
job_details = self._get_job_details(job_id)
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=job_id,
|
id=f"li-{job_id}",
|
||||||
title=title,
|
title=title,
|
||||||
company_name=company,
|
company_name=company,
|
||||||
company_url=company_url,
|
company_url=company_url,
|
||||||
@@ -224,7 +232,7 @@ class LinkedInScraper(Scraper):
|
|||||||
description=job_details.get("description"),
|
description=job_details.get("description"),
|
||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
logo_photo_url=job_details.get("logo_photo_url"),
|
company_logo=job_details.get("company_logo"),
|
||||||
job_function=job_details.get("job_function"),
|
job_function=job_details.get("job_function"),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -236,7 +244,7 @@ class LinkedInScraper(Scraper):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = self.session.get(
|
response = self.session.get(
|
||||||
f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
|
f"{self.base_url}/jobs/view/{job_id}", timeout=5
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except:
|
except:
|
||||||
@@ -266,15 +274,19 @@ class LinkedInScraper(Scraper):
|
|||||||
)
|
)
|
||||||
if job_function_span:
|
if job_function_span:
|
||||||
job_function = job_function_span.text.strip()
|
job_function = job_function_span.text.strip()
|
||||||
|
|
||||||
|
company_logo = (
|
||||||
|
logo_image.get("data-delayed-url")
|
||||||
|
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
|
||||||
|
else None
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
"job_level": self._parse_job_level(soup),
|
"job_level": self._parse_job_level(soup),
|
||||||
"company_industry": self._parse_company_industry(soup),
|
"company_industry": self._parse_company_industry(soup),
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
"job_url_direct": self._parse_job_url_direct(soup),
|
"job_url_direct": self._parse_job_url_direct(soup),
|
||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"company_logo": company_logo,
|
||||||
"data-delayed-url"
|
|
||||||
),
|
|
||||||
"job_function": job_function,
|
"job_function": job_function,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -401,12 +413,3 @@ class LinkedInScraper(Scraper):
|
|||||||
JobType.CONTRACT: "C",
|
JobType.CONTRACT: "C",
|
||||||
JobType.TEMPORARY: "T",
|
JobType.TEMPORARY: "T",
|
||||||
}.get(job_type_enum, "")
|
}.get(job_type_enum, "")
|
||||||
|
|
||||||
headers = {
|
|
||||||
"authority": "www.linkedin.com",
|
|
||||||
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
||||||
"accept-language": "en-US,en;q=0.9",
|
|
||||||
"cache-control": "max-age=0",
|
|
||||||
"upgrade-insecure-requests": "1",
|
|
||||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
||||||
}
|
|
||||||
|
|||||||
8
src/jobspy/scrapers/linkedin/constants.py
Normal file
8
src/jobspy/scrapers/linkedin/constants.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# Default request headers for the LinkedIn scraper. LinkedInScraper.__init__
# imports this dict and applies it with `self.session.headers.update(headers)`.
# The values present the client as desktop Chrome 120 on macOS.
headers = {
    "authority": "www.linkedin.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
|
||||||
@@ -12,15 +12,18 @@ from requests.adapters import HTTPAdapter, Retry
|
|||||||
|
|
||||||
from ..jobs import CompensationInterval, JobType
|
from ..jobs import CompensationInterval, JobType
|
||||||
|
|
||||||
logger = logging.getLogger("JobSpy")
|
|
||||||
logger.propagate = False
|
def create_logger(name: str):
|
||||||
if not logger.handlers:
|
logger = logging.getLogger(f"JobSpy:{name}")
|
||||||
logger.setLevel(logging.INFO)
|
logger.propagate = False
|
||||||
console_handler = logging.StreamHandler()
|
if not logger.handlers:
|
||||||
format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
logger.setLevel(logging.INFO)
|
||||||
formatter = logging.Formatter(format)
|
console_handler = logging.StreamHandler()
|
||||||
console_handler.setFormatter(formatter)
|
format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
||||||
logger.addHandler(console_handler)
|
formatter = logging.Formatter(format)
|
||||||
|
console_handler.setFormatter(formatter)
|
||||||
|
logger.addHandler(console_handler)
|
||||||
|
return logger
|
||||||
|
|
||||||
|
|
||||||
class RotatingProxySession:
|
class RotatingProxySession:
|
||||||
@@ -100,6 +103,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
|
|||||||
def create_session(
|
def create_session(
|
||||||
*,
|
*,
|
||||||
proxies: dict | str | None = None,
|
proxies: dict | str | None = None,
|
||||||
|
ca_cert: str | None = None,
|
||||||
is_tls: bool = True,
|
is_tls: bool = True,
|
||||||
has_retry: bool = False,
|
has_retry: bool = False,
|
||||||
delay: int = 1,
|
delay: int = 1,
|
||||||
@@ -119,6 +123,9 @@ def create_session(
|
|||||||
clear_cookies=clear_cookies,
|
clear_cookies=clear_cookies,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if ca_cert:
|
||||||
|
session.verify = ca_cert
|
||||||
|
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
@@ -134,7 +141,9 @@ def set_logger_level(verbose: int = 2):
|
|||||||
level_name = {2: "INFO", 1: "WARNING", 0: "ERROR"}.get(verbose, "INFO")
|
level_name = {2: "INFO", 1: "WARNING", 0: "ERROR"}.get(verbose, "INFO")
|
||||||
level = getattr(logging, level_name.upper(), None)
|
level = getattr(logging, level_name.upper(), None)
|
||||||
if level is not None:
|
if level is not None:
|
||||||
logger.setLevel(level)
|
for logger_name in logging.root.manager.loggerDict:
|
||||||
|
if logger_name.startswith("JobSpy:"):
|
||||||
|
logging.getLogger(logger_name).setLevel(level)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid log level: {level_name}")
|
raise ValueError(f"Invalid log level: {level_name}")
|
||||||
|
|
||||||
@@ -195,9 +204,14 @@ def extract_salary(
|
|||||||
monthly_threshold=30000,
|
monthly_threshold=30000,
|
||||||
enforce_annual_salary=False,
|
enforce_annual_salary=False,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Extracts salary information from a string and returns the salary interval, min and max salary values, and currency.
|
||||||
|
(TODO: Needs test cases as the regex is complicated and may not cover all edge cases)
|
||||||
|
"""
|
||||||
if not salary_str:
|
if not salary_str:
|
||||||
return None, None, None, None
|
return None, None, None, None
|
||||||
|
|
||||||
|
annual_max_salary = None
|
||||||
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
|
min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"
|
||||||
|
|
||||||
def to_int(s):
|
def to_int(s):
|
||||||
@@ -238,6 +252,8 @@ def extract_salary(
|
|||||||
annual_max_salary = max_salary
|
annual_max_salary = max_salary
|
||||||
|
|
||||||
# Ensure salary range is within specified limits
|
# Ensure salary range is within specified limits
|
||||||
|
if not annual_max_salary:
|
||||||
|
return None, None, None, None
|
||||||
if (
|
if (
|
||||||
lower_limit <= annual_min_salary <= upper_limit
|
lower_limit <= annual_min_salary <= upper_limit
|
||||||
and lower_limit <= annual_max_salary <= upper_limit
|
and lower_limit <= annual_max_salary <= upper_limit
|
||||||
@@ -248,3 +264,22 @@ def extract_salary(
|
|||||||
else:
|
else:
|
||||||
return interval, min_salary, max_salary, "USD"
|
return interval, min_salary, max_salary, "USD"
|
||||||
return None, None, None, None
|
return None, None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_job_type(description: str):
    """
    Detects the job types mentioned in a free-text job description.

    Every keyword pattern is searched case-insensitively anywhere in the text,
    so a single description can yield several job types.

    :param description: job description text; may be empty or None
    :return: an empty list when no description is given, a list of the matching
        JobType members when at least one keyword is found, otherwise None
    """
    if not description:
        return []

    # Accept an optional space OR hyphen between the words so that "full time",
    # "fulltime" and the very common "full-time" all match (same for part-time).
    keywords = {
        JobType.FULL_TIME: r"full[\s-]?time",
        JobType.PART_TIME: r"part[\s-]?time",
        JobType.INTERNSHIP: r"internship",
        JobType.CONTRACT: r"contract",
    }

    listing_types = [
        job_type
        for job_type, pattern in keywords.items()
        if re.search(pattern, description, re.IGNORECASE)
    ]

    # Callers receive None (not an empty list) when nothing matched.
    return listing_types if listing_types else None
|
||||||
|
|||||||
@@ -18,13 +18,14 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from .constants import headers
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
logger,
|
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
remove_attributes,
|
remove_attributes,
|
||||||
|
create_logger,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
@@ -36,19 +37,24 @@ from ...jobs import (
|
|||||||
DescriptionFormat,
|
DescriptionFormat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger = create_logger("ZipRecruiter")
|
||||||
|
|
||||||
|
|
||||||
class ZipRecruiterScraper(Scraper):
|
class ZipRecruiterScraper(Scraper):
|
||||||
base_url = "https://www.ziprecruiter.com"
|
base_url = "https://www.ziprecruiter.com"
|
||||||
api_url = "https://api.ziprecruiter.com"
|
api_url = "https://api.ziprecruiter.com"
|
||||||
|
|
||||||
def __init__(self, proxies: list[str] | str | None = None):
|
def __init__(
|
||||||
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
|
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
|
||||||
"""
|
"""
|
||||||
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
|
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
|
||||||
|
|
||||||
self.scraper_input = None
|
self.scraper_input = None
|
||||||
self.session = create_session(proxies=proxies)
|
self.session = create_session(proxies=proxies, ca_cert=ca_cert)
|
||||||
|
self.session.headers.update(headers)
|
||||||
self._get_cookies()
|
self._get_cookies()
|
||||||
|
|
||||||
self.delay = 5
|
self.delay = 5
|
||||||
@@ -71,7 +77,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
break
|
break
|
||||||
if page > 1:
|
if page > 1:
|
||||||
time.sleep(self.delay)
|
time.sleep(self.delay)
|
||||||
logger.info(f"ZipRecruiter search page: {page}")
|
logger.info(f"search page: {page} / {max_pages}")
|
||||||
jobs_on_page, continue_token = self._find_jobs_in_page(
|
jobs_on_page, continue_token = self._find_jobs_in_page(
|
||||||
scraper_input, continue_token
|
scraper_input, continue_token
|
||||||
)
|
)
|
||||||
@@ -97,9 +103,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
if continue_token:
|
if continue_token:
|
||||||
params["continue_from"] = continue_token
|
params["continue_from"] = continue_token
|
||||||
try:
|
try:
|
||||||
res = self.session.get(
|
res = self.session.get(f"{self.api_url}/jobs-app/jobs", params=params)
|
||||||
f"{self.api_url}/jobs-app/jobs", headers=self.headers, params=params
|
|
||||||
)
|
|
||||||
if res.status_code not in range(200, 400):
|
if res.status_code not in range(200, 400):
|
||||||
if res.status_code == 429:
|
if res.status_code == 429:
|
||||||
err = "429 Response - Blocked by ZipRecruiter for too many requests"
|
err = "429 Response - Blocked by ZipRecruiter for too many requests"
|
||||||
@@ -160,7 +164,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
description_full, job_url_direct = self._get_descr(job_url)
|
description_full, job_url_direct = self._get_descr(job_url)
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job["listing_key"]),
|
id=f'zr-{job["listing_key"]}',
|
||||||
title=title,
|
title=title,
|
||||||
company_name=company,
|
company_name=company,
|
||||||
location=location,
|
location=location,
|
||||||
@@ -180,7 +184,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _get_descr(self, job_url):
|
def _get_descr(self, job_url):
|
||||||
res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
|
res = self.session.get(job_url, allow_redirects=True)
|
||||||
description_full = job_url_direct = None
|
description_full = job_url_direct = None
|
||||||
if res.ok:
|
if res.ok:
|
||||||
soup = BeautifulSoup(res.text, "html.parser")
|
soup = BeautifulSoup(res.text, "html.parser")
|
||||||
@@ -200,7 +204,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
script_tag = soup.find("script", type="application/json")
|
script_tag = soup.find("script", type="application/json")
|
||||||
if script_tag:
|
if script_tag:
|
||||||
job_json = json.loads(script_tag.string)
|
job_json = json.loads(script_tag.string)
|
||||||
job_url_val = job_json["model"]["saveJobURL"]
|
job_url_val = job_json["model"].get("saveJobURL", "")
|
||||||
m = re.search(r"job_url=(.+)", job_url_val)
|
m = re.search(r"job_url=(.+)", job_url_val)
|
||||||
if m:
|
if m:
|
||||||
job_url_direct = m.group(1)
|
job_url_direct = m.group(1)
|
||||||
@@ -213,7 +217,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
def _get_cookies(self):
|
def _get_cookies(self):
|
||||||
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||||
url = f"{self.api_url}/jobs-app/event"
|
url = f"{self.api_url}/jobs-app/event"
|
||||||
self.session.post(url, data=data, headers=self.headers)
|
self.session.post(url, data=data)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
def _get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||||
@@ -241,14 +245,3 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
if scraper_input.distance:
|
if scraper_input.distance:
|
||||||
params["radius"] = scraper_input.distance
|
params["radius"] = scraper_input.distance
|
||||||
return {k: v for k, v in params.items() if v is not None}
|
return {k: v for k, v in params.items() if v is not None}
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Host": "api.ziprecruiter.com",
|
|
||||||
"accept": "*/*",
|
|
||||||
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
|
|
||||||
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
|
|
||||||
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
|
|
||||||
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
|
|
||||||
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
|
|
||||||
"accept-language": "en-US,en;q=0.9",
|
|
||||||
}
|
|
||||||
|
|||||||
10
src/jobspy/scrapers/ziprecruiter/constants.py
Normal file
10
src/jobspy/scrapers/ziprecruiter/constants.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Default request headers for the ZipRecruiter scraper.
# ZipRecruiterScraper.__init__ imports this dict and applies it with
# `self.session.headers.update(headers)`, so every request made through that
# session carries them. The user-agent presents the client as the
# "Job Search" iOS app (version 87.0).
# NOTE(review): "authorization" is a hard-coded Basic credential, and the
# device / push-notification ids are fixed values.
# NOTE(review): "Host" is pinned to api.ziprecruiter.com but the same session
# also fetches job detail URLs; confirm those are served by the same host.
headers = {
    "Host": "api.ziprecruiter.com",
    "accept": "*/*",
    "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
    "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
    "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
    "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
    "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
    "accept-language": "en-US,en;q=0.9",
}
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
from ..jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def test_all():
|
|
||||||
result = scrape_jobs(
|
|
||||||
site_name=["linkedin", "indeed", "zip_recruiter", "glassdoor"],
|
|
||||||
search_term="software engineer",
|
|
||||||
results_wanted=5,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert (
|
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
|
||||||
), "Result should be a non-empty DataFrame"
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from ..jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def test_indeed():
|
|
||||||
result = scrape_jobs(
|
|
||||||
site_name="glassdoor", search_term="software engineer", country_indeed="USA"
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
|
||||||
), "Result should be a non-empty DataFrame"
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from ..jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def test_indeed():
|
|
||||||
result = scrape_jobs(
|
|
||||||
site_name="indeed", search_term="software engineer", country_indeed="usa"
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
|
||||||
), "Result should be a non-empty DataFrame"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
from ..jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def test_linkedin():
|
|
||||||
result = scrape_jobs(
|
|
||||||
site_name="linkedin",
|
|
||||||
search_term="software engineer",
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
|
||||||
), "Result should be a non-empty DataFrame"
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
from ..jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def test_ziprecruiter():
|
|
||||||
result = scrape_jobs(
|
|
||||||
site_name="zip_recruiter",
|
|
||||||
search_term="software engineer",
|
|
||||||
)
|
|
||||||
|
|
||||||
assert (
|
|
||||||
isinstance(result, pd.DataFrame) and not result.empty
|
|
||||||
), "Result should be a non-empty DataFrame"
|
|
||||||
18
tests/test_all.py
Normal file
18
tests/test_all.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_all():
    """Scrape several sites in one call and expect exactly 5 rows per site."""
    sites = [
        "indeed",
        "glassdoor",
    ]  # ziprecruiter/linkedin needs good ip, and temp fix to pass test on ci
    results_per_site = 5
    result = scrape_jobs(
        site_name=sites,
        search_term="engineer",
        results_wanted=results_per_site,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    expected_rows = len(sites) * results_per_site
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == expected_rows
    ), f"Expected {expected_rows} rows, got {len(result)}"
|
||||||
13
tests/test_glassdoor.py
Normal file
13
tests/test_glassdoor.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_glassdoor():
    """Scrape Glassdoor and expect exactly the requested number of rows."""
    results_wanted = 5
    result = scrape_jobs(
        site_name="glassdoor",
        search_term="engineer",
        results_wanted=results_wanted,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == results_wanted
    ), f"Expected {results_wanted} rows, got {len(result)}"
|
||||||
12
tests/test_google.py
Normal file
12
tests/test_google.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_google():
    """Scrape Google jobs and expect exactly the requested number of rows."""
    results_wanted = 5
    result = scrape_jobs(
        site_name="google",
        search_term="software engineer",
        results_wanted=results_wanted,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == results_wanted
    ), f"Expected {results_wanted} rows, got {len(result)}"
|
||||||
13
tests/test_indeed.py
Normal file
13
tests/test_indeed.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_indeed():
    """Scrape Indeed and expect exactly the requested number of rows."""
    results_wanted = 5
    result = scrape_jobs(
        site_name="indeed",
        search_term="engineer",
        results_wanted=results_wanted,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == results_wanted
    ), f"Expected {results_wanted} rows, got {len(result)}"
|
||||||
9
tests/test_linkedin.py
Normal file
9
tests/test_linkedin.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_linkedin():
    """Scrape LinkedIn and expect exactly the requested number of rows."""
    results_wanted = 5
    result = scrape_jobs(
        site_name="linkedin",
        search_term="engineer",
        results_wanted=results_wanted,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == results_wanted
    ), f"Expected {results_wanted} rows, got {len(result)}"
|
||||||
12
tests/test_ziprecruiter.py
Normal file
12
tests/test_ziprecruiter.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from jobspy import scrape_jobs
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def test_ziprecruiter():
    """Scrape ZipRecruiter and expect exactly the requested number of rows."""
    results_wanted = 5
    result = scrape_jobs(
        site_name="zip_recruiter",
        search_term="software engineer",
        results_wanted=results_wanted,
    )

    # Separate asserts so a failure says whether the type or the count is wrong.
    assert isinstance(result, pd.DataFrame), "Result should be a DataFrame"
    assert (
        len(result) == results_wanted
    ), f"Expected {results_wanted} rows, got {len(result)}"
|
||||||
Reference in New Issue
Block a user