mirror of https://github.com/Bunsly/JobSpy.git
refactor:organize code

jobspy/google/__init__.py (new file, 202 lines)
@@ -0,0 +1,202 @@
from __future__ import annotations

import math
import re
import json
from typing import Tuple
from datetime import datetime, timedelta

from jobspy.google.constant import headers_jobs, headers_initial, async_param
from jobspy.model import (
    Scraper,
    ScraperInput,
    Site,
    JobPost,
    JobResponse,
    Location,
    JobType,
)
from jobspy.util import extract_emails_from_text, extract_job_type, create_session
from jobspy.google.util import log, find_job_info_initial_page, find_job_info


class Google(Scraper):
    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes Google Scraper with the Google jobs search url
        """
        site = Site(Site.GOOGLE)
        super().__init__(site, proxies=proxies, ca_cert=ca_cert)

        self.country = None
        self.session = None
        self.scraper_input = None
        self.jobs_per_page = 10
        self.seen_urls = set()
        self.url = "https://www.google.com/search"
        self.jobs_url = "https://www.google.com/async/callback:550"

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Google for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

        self.session = create_session(
            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
        )
        # The initial HTML page yields both the first batch of jobs and the
        # cursor used to request every subsequent page.
        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
        if forward_cursor is None:
            log.warning(
                "initial cursor not found; try changing your query, or there were 10 or fewer results"
            )
            return JobResponse(jobs=job_list)

        page = 1

        while (
            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
            and forward_cursor
        ):
            log.info(
                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
            )
            try:
                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
            except Exception as e:
                log.error(f"failed to get jobs on page: {page}, {e}")
                break
            if not jobs:
                log.info(f"found no jobs on page: {page}")
                break
            job_list += jobs
            page += 1
        return JobResponse(
            jobs=job_list[
                scraper_input.offset : scraper_input.offset
                + scraper_input.results_wanted
            ]
        )

    def _get_initial_cursor_and_jobs(self) -> Tuple[str | None, list[JobPost]]:
        """Gets initial cursor and jobs to paginate through job listings"""
        query = f"{self.scraper_input.search_term} jobs"

        def get_time_range(hours_old):
            if hours_old <= 24:
                return "since yesterday"
            elif hours_old <= 72:
                return "in the last 3 days"
            elif hours_old <= 168:
                return "in the last week"
            else:
                return "in the last month"

        job_type_mapping = {
            JobType.FULL_TIME: "Full time",
            JobType.PART_TIME: "Part time",
            JobType.INTERNSHIP: "Internship",
            JobType.CONTRACT: "Contract",
        }

        if self.scraper_input.job_type in job_type_mapping:
            query += f" {job_type_mapping[self.scraper_input.job_type]}"

        if self.scraper_input.location:
            query += f" near {self.scraper_input.location}"

        if self.scraper_input.hours_old:
            time_filter = get_time_range(self.scraper_input.hours_old)
            query += f" {time_filter}"

        if self.scraper_input.is_remote:
            query += " remote"

        # An explicit google_search_term overrides the query built above.
        if self.scraper_input.google_search_term:
            query = self.scraper_input.google_search_term

        params = {"q": query, "udm": "8"}
        response = self.session.get(self.url, headers=headers_initial, params=params)

        # The forward cursor lives in a data-async-fc attribute on the page.
        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, response.text)
        data_async_fc = match_fc.group(1) if match_fc else None
        jobs_raw = find_job_info_initial_page(response.text)
        jobs = []
        for job_raw in jobs_raw:
            job_post = self._parse_job(job_raw)
            if job_post:
                jobs.append(job_post)
        return data_async_fc, jobs

    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str | None]:
        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
        return self._parse_jobs(response.text)

    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str | None]:
        """
        Parses jobs on a page with next page cursor
        """
        # Job payloads are embedded as a JSON array between the outermost
        # "[[[" and "]]]" markers of the async response.
        start_idx = job_data.find("[[[")
        end_idx = job_data.rindex("]]]") + 3
        s = job_data[start_idx:end_idx]
        parsed = json.loads(s)[0]

        pattern_fc = r'data-async-fc="([^"]+)"'
        match_fc = re.search(pattern_fc, job_data)
        data_async_fc = match_fc.group(1) if match_fc else None
        jobs_on_page = []
        for array in parsed:
            _, raw_job = array
            if not raw_job.startswith("[[["):
                continue
            job_d = json.loads(raw_job)

            job_info = find_job_info(job_d)
            job_post = self._parse_job(job_info)
            if job_post:
                jobs_on_page.append(job_post)
        return jobs_on_page, data_async_fc

    def _parse_job(self, job_info: list):
        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
        if job_url in self.seen_urls:
            return
        self.seen_urls.add(job_url)

        title = job_info[0]
        company_name = job_info[1]
        location = city = job_info[2]
        state = country = date_posted = None
        if location and "," in location:
            city, state, *country = [part.strip() for part in location.split(",")]

        days_ago_str = job_info[12]
        if isinstance(days_ago_str, str):
            match = re.search(r"\d+", days_ago_str)
            # Guard against strings with no digits (e.g. "yesterday"), which
            # would otherwise pass days=None into timedelta and crash.
            if match:
                days_ago = int(match.group())
                date_posted = (datetime.now() - timedelta(days=days_ago)).date()

        description = job_info[19]
        desc_lower = description.lower() if description else ""

        job_post = JobPost(
            id=f"go-{job_info[28]}",
            title=title,
            company_name=company_name,
            location=Location(
                city=city, state=state, country=country[0] if country else None
            ),
            job_url=job_url,
            date_posted=date_posted,
            is_remote="remote" in desc_lower or "wfh" in desc_lower,
            description=description,
            emails=extract_emails_from_text(description),
            job_type=extract_job_type(description),
        )
        return job_post
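
A minimal usage sketch for the scraper above (not part of the commit). The ScraperInput field names are inferred from the attribute reads in scrape() and _get_initial_cursor_and_jobs(); the exact constructor fields and defaults live in jobspy.model and may differ.

    from jobspy.google import Google
    from jobspy.model import ScraperInput

    scraper = Google()
    criteria = ScraperInput(  # field names assumed from the usage above
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=20,
    )
    response = scraper.scrape(criteria)
    for job in response.jobs:
        print(job.title, "@", job.company_name, "-", job.job_url)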

jobspy/google/constant.py (new file, 52 lines)
@@ -0,0 +1,52 @@
headers_initial = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=0, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "x-browser-channel": "stable",
    "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
    "x-browser-year": "2024",
}

headers_jobs = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=1, i",
    "referer": "https://www.google.com/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "sec-ch-ua-arch": '"arm"',
    "sec-ch-ua-bitness": '"64"',
    "sec-ch-ua-form-factors": '"Desktop"',
    "sec-ch-ua-full-version": '"130.0.6723.58"',
    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": '""',
    "sec-ch-ua-platform": '"macOS"',
    "sec-ch-ua-platform-version": '"15.0.1"',
    "sec-ch-ua-wow64": "?0",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
}

async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"
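
These header sets impersonate Chrome 130 on macOS, and async_param is the opaque payload the callback:550 endpoint expects. A standalone sketch (not part of the commit) of the initial request these constants support, using plain requests; the scraper itself goes through jobspy.util.create_session for proxy and retry handling, and the query value here is illustrative:

    import requests

    from jobspy.google.constant import headers_initial

    # Query params copied from _get_initial_cursor_and_jobs in __init__.py.
    resp = requests.get(
        "https://www.google.com/search",
        headers=headers_initial,
        params={"q": "python developer jobs", "udm": "8"},
        timeout=10,
    )
    print(resp.status_code, len(resp.text))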

jobspy/google/util.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import json
import re

from jobspy.util import create_logger

log = create_logger("Google")


def find_job_info(jobs_data: list | dict) -> list | None:
    """Iterates through the JSON data to find the job listings"""
    if isinstance(jobs_data, dict):
        for key, value in jobs_data.items():
            if key == "520084652" and isinstance(value, list):
                return value
            else:
                result = find_job_info(value)
                if result:
                    return result
    elif isinstance(jobs_data, list):
        for item in jobs_data:
            result = find_job_info(item)
            if result:
                return result
    return None


def find_job_info_initial_page(html_text: str):
    """Extracts the embedded job JSON blobs from the initial search results HTML."""
    # Match the JSON array that follows each "520084652" key in the page source.
    pattern = '520084652":(' + r"\[.*?\]\s*])\s*}\s*]\s*]\s*]\s*]\s*]"
    results = []
    matches = re.finditer(pattern, html_text)

    for match in matches:
        try:
            parsed_data = json.loads(match.group(1))
            results.append(parsed_data)

        except json.JSONDecodeError as e:
            log.error(f"Failed to parse match: {str(e)}")
            results.append({"raw_match": match.group(0), "error": str(e)})
    return results
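
A small illustration of the recursive search in find_job_info (the nesting here is invented for demonstration; real payloads nest far deeper):

    sample = [1, {"outer": [{"520084652": ["job", "fields", "here"]}]}]
    assert find_job_info(sample) == ["job", "fields", "here"]
    assert find_job_info({"other": 1}) is None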