# jobspy/jobspy/google/__init__.py
from __future__ import annotations

import json
import math
import re
from datetime import datetime, timedelta
from typing import Tuple

from jobspy.google.constant import async_param, headers_initial, headers_jobs
from jobspy.google.util import find_job_info, find_job_info_initial_page, log
from jobspy.model import (
    JobPost,
    JobResponse,
    JobType,
    Location,
    Scraper,
    ScraperInput,
    Site,
)
from jobspy.util import create_session, extract_emails_from_text, extract_job_type
class Google(Scraper):
    """Scraper for Google Jobs search results, paginating via async cursors."""

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes Google Scraper with the Google jobs search url
        """
        site = Site(Site.GOOGLE)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10  # Google serves ~10 results per async page
        self.seen_urls = set()  # dedupes job urls across pages
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        # Hard cap keeps the cursor pagination loop bounded.
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
        if forward_cursor is None:
            log.warning(
                "initial cursor not found, try changing your query or there was at most 10 results"
            )
            return JobResponse(jobs=job_list)

        page = 1
        # seen_urls grows monotonically, so it doubles as a progress counter.
        while (
            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
            and forward_cursor
        ):
            log.info(
                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
            )
            try:
                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            except Exception as e:
                log.error(f"failed to get jobs on page: {page}, {e}")
                break
            if not jobs:
                log.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1
        return JobResponse(
            jobs=job_list[
                scraper_input.offset : scraper_input.offset
                + scraper_input.results_wanted
            ]
        )

    def _get_initial_cursor_and_jobs(self) -> Tuple[str, list[JobPost]]:
        """Gets initial cursor and jobs to paginate through job listings"""
        query = f"{self.scraper_input.search_term} jobs"

        def get_time_range(hours_old):
            # Google only understands coarse, human-phrased time filters.
            if hours_old <= 24:
                return "since yesterday"
            elif hours_old <= 72:
                return "in the last 3 days"
            elif hours_old <= 168:
                return "in the last week"
            else:
                return "in the last month"

        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }

        if self.scraper_input.job_type in job_type_mapping:
            query += f" {job_type_mapping[self.scraper_input.job_type]}"

        if self.scraper_input.location:
            query += f" near {self.scraper_input.location}"

        if self.scraper_input.hours_old:
            time_filter = get_time_range(self.scraper_input.hours_old)
            query += f" {time_filter}"

        if self.scraper_input.is_remote:
            query += " remote"

        # An explicit google_search_term overrides everything built above.
        if self.scraper_input.google_search_term:
            query = self.scraper_input.google_search_term

        params = {"q": query, "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        # The forward cursor lives in a data-async-fc attribute on the results page.
        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, response.text)
        data_async_fc = match_fc.group(1) if match_fc else None
        jobs_raw = find_job_info_initial_page(response.text)
        jobs = []
        for job_raw in jobs_raw:
            job_post = self._parse_job(job_raw)
            if job_post:
                jobs.append(job_post)
        return data_async_fc, jobs

    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
        """Fetches one async results page using the given forward cursor."""
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
        """
        Parses jobs on a page with next page cursor
        """
        # The payload embeds a JSON array bracketed by "[[[" ... "]]]".
        start_idx = job_data.find("[[[")
        end_idx = job_data.rindex("]]]") + 3
        s = job_data[start_idx:end_idx]
        parsed = json.loads(s)[0]

        pattern_fc = r'data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, job_data)
        data_async_fc = match_fc.group(1) if match_fc else None

        jobs_on_page = []
        for array in parsed:
            # NOTE: original code shadowed the job_data parameter here;
            # renamed to raw_job for clarity (behavior unchanged).
            _, raw_job = array
            if not raw_job.startswith("[[["):
                continue
            job_d = json.loads(raw_job)
            job_info = find_job_info(job_d)
            job_post = self._parse_job(job_info)
            if job_post:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list) -> JobPost | None:
        """
        Builds a JobPost from one raw job-info array.
        Returns None when the job's url was already seen (dedupe).
        """
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return
        self.seen_urls.add(job_url)

        title = job_info[0]
        company_name = job_info[1]
        location = city = job_info[2]
        state = country = date_posted = None
        if location and "," in location:
            city, state, *country = [part.strip() for part in location.split(",")]

        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            match = re.search(r"\d+", days_ago_str)
            # Only derive a date when digits were actually found; the original
            # crashed with timedelta(days=None) on strings without a number.
            if match:
                days_ago = int(match.group())
                date_posted = (datetime.now() - timedelta(days=days_ago)).date()

        description = job_info[19]
        # Guard: description can be absent — presumably index 19 is optional
        # in the raw payload (TODO confirm against live responses).
        desc_lower = (description or "").lower()

        return JobPost(
            id=f"go-{job_info[28]}",
            title=title,
            company_name=company_name,
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            date_posted=date_posted,
            is_remote="remote" in desc_lower or "wfh" in desc_lower,
            description=description,
            emails=extract_emails_from_text(description),
            job_type=extract_job_type(description),
        )