Compare commits

..

6 Commits

Author SHA1 Message Date
Cullen Watson
338d854b96 fix(google): search (#216) 2024-10-25 14:54:14 -05:00
Cullen Watson
811d4c40b4 chore:version 2024-10-24 15:28:25 -05:00
Cullen Watson
dba92d22c2 chore:version 2024-10-24 15:27:16 -05:00
Cullen Watson
10a3592a0f docs:file 2024-10-24 15:26:49 -05:00
Cullen Watson
b7905cc756 docs:file 2024-10-24 15:24:18 -05:00
Cullen Watson
6867d58829 docs:readme 2024-10-24 15:22:31 -05:00
5 changed files with 100 additions and 60 deletions

View File

@@ -2,8 +2,6 @@
**JobSpy** is a simple, yet comprehensive, job scraping library.
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
@@ -32,6 +30,7 @@ from jobspy import scrape_jobs
jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
search_term="software engineer",
google_search_term="software engineer jobs near San Francisco, CA since yesterday",
location="San Francisco, CA",
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
@@ -63,10 +62,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext
Optional
├── site_name (list|str):
| linkedin, zip_recruiter, indeed, glassdoor
| (default is all four)
| linkedin, zip_recruiter, indeed, glassdoor, google
| (default is all)
├── search_term (str)
|
├── google_search_term (str)
| search term for google jobs. This is the only param for filtering google jobs.
├── location (str)

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.73"
version = "1.1.75"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -24,6 +24,7 @@ from .scrapers.exceptions import (
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
google_search_term: str | None = None,
location: str | None = None,
distance: int | None = 50,
is_remote: bool = False,
@@ -86,6 +87,7 @@ def scrape_jobs(
site_type=get_site_type(),
country=country_enum,
search_term=search_term,
google_search_term=google_search_term,
location=location,
distance=distance,
is_remote=is_remote,
@@ -216,8 +218,8 @@ def scrape_jobs(
"title",
"company",
"location",
"job_type",
"date_posted",
"job_type",
"salary_source",
"interval",
"min_amount",
@@ -248,6 +250,8 @@ def scrape_jobs(
jobs_df = jobs_df[desired_order]
# Step 4: Sort the DataFrame as required
return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
return jobs_df.sort_values(
by=["site", "date_posted"], ascending=[True, False]
).reset_index(drop=True)
else:
return pd.DataFrame()

View File

@@ -28,6 +28,7 @@ class SalarySource(Enum):
class ScraperInput(BaseModel):
site_type: list[Site]
search_term: str | None = None
google_search_term: str | None = None
location: str | None = None
country: Country | None = Country.USA

View File

@@ -2,7 +2,7 @@
jobspy.scrapers.google
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Glassdoor.
This module contains routines to scrape Google.
"""
from __future__ import annotations
@@ -34,12 +34,11 @@ class GoogleJobsScraper(Scraper):
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
Initializes Google Scraper with the Google jobs search url
"""
site = Site(Site.GOOGLE)
super().__init__(site, proxies=proxies, ca_cert=ca_cert)
self.base_url = None
self.country = None
self.session = None
self.scraper_input = None
@@ -50,24 +49,24 @@ class GoogleJobsScraper(Scraper):
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Glassdoor for jobs with scraper_input criteria.
Scrapes Google for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
self.scraper_input = scraper_input
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
)
forward_cursor = self._get_initial_cursor()
forward_cursor, job_list = self._get_initial_cursor_and_jobs()
if forward_cursor is None:
logger.error("initial cursor not found")
return JobResponse(jobs=[])
logger.warning(
"initial cursor not found, try changing your query or there was at most 10 results"
)
return JobResponse(jobs=job_list)
page = 1
job_list: list[JobPost] = []
while (
len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
@@ -76,7 +75,11 @@ class GoogleJobsScraper(Scraper):
logger.info(
f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
)
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
try:
jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
except Exception as e:
logger.error(f"failed to get jobs on page: {page}, {e}")
break
if not jobs:
logger.info(f"found no jobs on page: {page}")
break
@@ -89,8 +92,8 @@ class GoogleJobsScraper(Scraper):
]
)
def _get_initial_cursor(self):
"""Gets initial cursor to paginate through job listings"""
def _get_initial_cursor_and_jobs(self) -> Tuple[str, list[JobPost]]:
"""Gets initial cursor and jobs to paginate through job listings"""
query = f"{self.scraper_input.search_term} jobs"
def get_time_range(hours_old):
@@ -123,13 +126,22 @@ class GoogleJobsScraper(Scraper):
if self.scraper_input.is_remote:
query += " remote"
if self.scraper_input.google_search_term:
query = self.scraper_input.google_search_term
params = {"q": query, "udm": "8"}
response = self.session.get(self.url, headers=headers_initial, params=params)
pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
match_fc = re.search(pattern_fc, response.text)
data_async_fc = match_fc.group(1) if match_fc else None
return data_async_fc
jobs_raw = self._find_job_info_initial_page(response.text)
jobs = []
for job_raw in jobs_raw:
job_post = self._parse_job(job_raw)
if job_post:
jobs.append(job_post)
return data_async_fc, jobs
def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
@@ -149,55 +161,55 @@ class GoogleJobsScraper(Scraper):
match_fc = re.search(pattern_fc, job_data)
data_async_fc = match_fc.group(1) if match_fc else None
jobs_on_page = []
for array in parsed:
_, job_data = array
if not job_data.startswith("[[["):
continue
job_d = json.loads(job_data)
job_info = self._find_job_info(job_d)
job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
if job_url in self.seen_urls:
continue
self.seen_urls.add(job_url)
title = job_info[0]
company_name = job_info[1]
location = city = job_info[2]
state = country = date_posted = None
if location and "," in location:
city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
days_ago_str = job_info[12]
if type(days_ago_str) == str:
match = re.search(r"\d+", days_ago_str)
days_ago = int(match.group()) if match else None
date_posted = (datetime.now() - timedelta(days=days_ago)).date()
description = job_info[19]
job_post = JobPost(
id=f"go-{job_info[28]}",
title=title,
company_name=company_name,
location=Location(
city=city, state=state, country=country[0] if country else None
),
job_url=job_url,
job_url_direct=job_url,
date_posted=date_posted,
is_remote="remote" in description.lower()
or "wfh" in description.lower(),
description=description,
emails=extract_emails_from_text(description),
job_type=extract_job_type(description),
)
jobs_on_page.append(job_post)
job_post = self._parse_job(job_info)
if job_post:
jobs_on_page.append(job_post)
return jobs_on_page, data_async_fc
def _parse_job(self, job_info: list):
job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
if job_url in self.seen_urls:
return
self.seen_urls.add(job_url)
title = job_info[0]
company_name = job_info[1]
location = city = job_info[2]
state = country = date_posted = None
if location and "," in location:
city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
days_ago_str = job_info[12]
if type(days_ago_str) == str:
match = re.search(r"\d+", days_ago_str)
days_ago = int(match.group()) if match else None
date_posted = (datetime.now() - timedelta(days=days_ago)).date()
description = job_info[19]
job_post = JobPost(
id=f"go-{job_info[28]}",
title=title,
company_name=company_name,
location=Location(
city=city, state=state, country=country[0] if country else None
),
job_url=job_url,
date_posted=date_posted,
is_remote="remote" in description.lower() or "wfh" in description.lower(),
description=description,
emails=extract_emails_from_text(description),
job_type=extract_job_type(description),
)
return job_post
@staticmethod
def _find_job_info(jobs_data: list | dict) -> list | None:
"""Iterates through the JSON data to find the job listings"""
@@ -215,3 +227,24 @@ class GoogleJobsScraper(Scraper):
if result:
return result
return None
@staticmethod
def _find_job_info_initial_page(html_text: str):
pattern = (
f'520084652":('
+ r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
)
results = []
matches = re.finditer(pattern, html_text)
import json
for match in matches:
try:
parsed_data = json.loads(match.group(1))
results.append(parsed_data)
except json.JSONDecodeError as e:
logger.error(f"Failed to parse match: {str(e)}")
results.append({"raw_match": match.group(0), "error": str(e)})
return results