mirror of https://github.com/Bunsly/JobSpy
parent 811d4c40b4
commit 338d854b96
@@ -30,6 +30,7 @@ from jobspy import scrape_jobs
 jobs = scrape_jobs(
     site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
     search_term="software engineer",
+    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
     results_wanted=20,
     hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
@@ -65,6 +66,9 @@ Optional
 │ (default is all)
 │
 ├── search_term (str)
 │
+├── google_search_term (str)
+│  search term for google jobs. This is the only param for filtering google jobs.
+│
 ├── location (str)
 │
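As the tree above notes, `google_search_term` is the only filter Google Jobs honors; the structured params (`location`, `distance`, `hours_old`, ...) apply to the other boards. A minimal usage sketch, assuming `python-jobspy` is installed and using column names from the diff below:

from jobspy import scrape_jobs

# Google-only search: put everything (role, location, recency) in the free-text query.
jobs = scrape_jobs(
    site_name=["google"],
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    results_wanted=20,
)
print(jobs[["title", "company", "location", "date_posted"]].head())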
@@ -171,9 +175,9 @@ Indeed specific

 ## Supported Countries for Job Searching

-### **LinkedIn**
+### **LinkedIn / Google**

-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn & Google search globally & use only the `location` parameter.

 ### **ZipRecruiter**

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.74"
+version = "1.1.75"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
@@ -24,6 +24,7 @@ from .scrapers.exceptions import (
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
+    google_search_term: str | None = None,
     location: str | None = None,
     distance: int | None = 50,
     is_remote: bool = False,
@@ -86,6 +87,7 @@ def scrape_jobs(
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
+        google_search_term=google_search_term,
         location=location,
         distance=distance,
         is_remote=is_remote,
@@ -216,8 +218,8 @@ def scrape_jobs(
         "title",
         "company",
         "location",
-        "job_type",
         "date_posted",
+        "job_type",
         "salary_source",
         "interval",
         "min_amount",
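The duplicate "job_type" entry in the old list was not harmless: selecting columns with a repeated label yields the column twice, which is presumably why the list was deduplicated here. A standalone pandas illustration (not JobSpy code):

import pandas as pd

df = pd.DataFrame({"title": ["SWE"], "job_type": ["fulltime"]})
# A repeated label in the selection list duplicates the column in the result.
print(df[["title", "job_type", "job_type"]].columns.tolist())
# ['title', 'job_type', 'job_type']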
@@ -248,6 +250,8 @@ def scrape_jobs(
         jobs_df = jobs_df[desired_order]

         # Step 4: Sort the DataFrame as required
-        return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
+        return jobs_df.sort_values(
+            by=["site", "date_posted"], ascending=[True, False]
+        ).reset_index(drop=True)
     else:
         return pd.DataFrame()
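`sort_values` keeps each row's original index label, so without the new `reset_index(drop=True)` the returned DataFrame carries a shuffled index. A standalone pandas sketch of the difference:

import pandas as pd

df = pd.DataFrame({"site": ["linkedin", "indeed"], "rank": [1, 2]})
sorted_df = df.sort_values(by="site")        # index becomes [1, 0]
clean_df = sorted_df.reset_index(drop=True)  # index renumbered to [0, 1]
print(sorted_df.index.tolist(), clean_df.index.tolist())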
@@ -28,6 +28,7 @@ class SalarySource(Enum):
 class ScraperInput(BaseModel):
     site_type: list[Site]
     search_term: str | None = None
+    google_search_term: str | None = None

     location: str | None = None
     country: Country | None = Country.USA
@@ -59,13 +59,14 @@ class GoogleJobsScraper(Scraper):
         self.session = create_session(
             proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
         )
-        forward_cursor = self._get_initial_cursor()
+        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.error("initial cursor not found")
-            return JobResponse(jobs=[])
+            logger.warning(
+                "initial cursor not found, try changing your query or there was at most 10 results"
+            )
+            return JobResponse(jobs=job_list)

         page = 1
-        job_list: list[JobPost] = []

         while (
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
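The practical effect: a query whose results fit on the first page (roughly ten jobs, per the new log message) used to come back empty because no pagination cursor exists; now the jobs already parsed from that first page are returned anyway. A stub-based sketch of the new control flow (names stand in for the real scraper internals):

from typing import Optional

def get_initial_cursor_and_jobs() -> tuple[Optional[str], list[str]]:
    # Stand-in for _get_initial_cursor_and_jobs: a small query yields
    # first-page jobs but no forward cursor for further pages.
    return None, ["job 1", "job 2"]

cursor, jobs = get_initial_cursor_and_jobs()
if cursor is None:
    # Before this commit: return [] (first-page results were dropped).
    # After this commit: the first-page jobs are still returned.
    print(jobs)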
@@ -74,7 +75,11 @@ class GoogleJobsScraper(Scraper):
             logger.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
-            jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            try:
+                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            except Exception as e:
+                logger.error(f"failed to get jobs on page: {page}, {e}")
+                break
             if not jobs:
                 logger.info(f"found no jobs on page: {page}")
                 break
@@ -87,8 +92,8 @@ class GoogleJobsScraper(Scraper):
             ]
         )

-    def _get_initial_cursor(self):
-        """Gets initial cursor to paginate through job listings"""
+    def _get_initial_cursor_and_jobs(self) -> Tuple[str, list[JobPost]]:
+        """Gets initial cursor and jobs to paginate through job listings"""
         query = f"{self.scraper_input.search_term} jobs"

         def get_time_range(hours_old):
@@ -121,13 +126,22 @@ class GoogleJobsScraper(Scraper):
         if self.scraper_input.is_remote:
             query += " remote"

+        if self.scraper_input.google_search_term:
+            query = self.scraper_input.google_search_term
+
         params = {"q": query, "udm": "8"}
         response = self.session.get(self.url, headers=headers_initial, params=params)

         pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        return data_async_fc
+        jobs_raw = self._find_job_info_initial_page(response.text)
+        jobs = []
+        for job_raw in jobs_raw:
+            job_post = self._parse_job(job_raw)
+            if job_post:
+                jobs.append(job_post)
+        return data_async_fc, jobs

     def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
         params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
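Note that `google_search_term`, when set, replaces the auto-built query outright rather than being appended, so `search_term` and `is_remote` have no effect on Google results. The selection logic, distilled from the hunk above into a runnable sketch (build_query is an illustrative helper, not JobSpy API):

def build_query(search_term: str, is_remote: bool, google_search_term: str | None) -> str:
    query = f"{search_term} jobs"
    if is_remote:
        query += " remote"
    if google_search_term:
        query = google_search_term  # wholesale override, not a suffix
    return query

assert build_query("software engineer", True, None) == "software engineer jobs remote"
assert build_query("software engineer", True, "swe jobs since yesterday") == "swe jobs since yesterday"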
@@ -147,55 +161,55 @@ class GoogleJobsScraper(Scraper):
         match_fc = re.search(pattern_fc, job_data)
         data_async_fc = match_fc.group(1) if match_fc else None
         jobs_on_page = []

         for array in parsed:

             _, job_data = array
             if not job_data.startswith("[[["):
                 continue
             job_d = json.loads(job_data)

             job_info = self._find_job_info(job_d)

-            job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-
-            title = job_info[0]
-            company_name = job_info[1]
-            location = city = job_info[2]
-            state = country = date_posted = None
-            if location and "," in location:
-                city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
-
-            days_ago_str = job_info[12]
-            if type(days_ago_str) == str:
-                match = re.search(r"\d+", days_ago_str)
-                days_ago = int(match.group()) if match else None
-                date_posted = (datetime.now() - timedelta(days=days_ago)).date()
-
-            description = job_info[19]
-
-            job_post = JobPost(
-                id=f"go-{job_info[28]}",
-                title=title,
-                company_name=company_name,
-                location=Location(
-                    city=city, state=state, country=country[0] if country else None
-                ),
-                job_url=job_url,
-                job_url_direct=job_url,
-                date_posted=date_posted,
-                is_remote="remote" in description.lower()
-                or "wfh" in description.lower(),
-                description=description,
-                emails=extract_emails_from_text(description),
-                job_type=extract_job_type(description),
-            )
-            jobs_on_page.append(job_post)
+            job_post = self._parse_job(job_info)
+            if job_post:
+                jobs_on_page.append(job_post)
         return jobs_on_page, data_async_fc

+    def _parse_job(self, job_info: list):
+        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
+        if job_url in self.seen_urls:
+            return
+        self.seen_urls.add(job_url)
+
+        title = job_info[0]
+        company_name = job_info[1]
+        location = city = job_info[2]
+        state = country = date_posted = None
+        if location and "," in location:
+            city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
+
+        days_ago_str = job_info[12]
+        if type(days_ago_str) == str:
+            match = re.search(r"\d+", days_ago_str)
+            days_ago = int(match.group()) if match else None
+            date_posted = (datetime.now() - timedelta(days=days_ago)).date()
+
+        description = job_info[19]
+
+        job_post = JobPost(
+            id=f"go-{job_info[28]}",
+            title=title,
+            company_name=company_name,
+            location=Location(
+                city=city, state=state, country=country[0] if country else None
+            ),
+            job_url=job_url,
+            date_posted=date_posted,
+            is_remote="remote" in description.lower() or "wfh" in description.lower(),
+            description=description,
+            emails=extract_emails_from_text(description),
+            job_type=extract_job_type(description),
+        )
+        return job_post
+
     @staticmethod
     def _find_job_info(jobs_data: list | dict) -> list | None:
         """Iterates through the JSON data to find the job listings"""
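The starred unpack in `_parse_job` does quiet work: `city, state, *country` requires at least two comma-separated parts (guarded by the `"," in location` check), and `country` absorbs whatever is left, possibly nothing. A worked example of just that line, with invented location strings:

# Mirrors: city, state, *country = [*map(lambda x: x.strip(), location.split(",")))]
for location in ["San Francisco, CA, US", "Berlin, Germany"]:
    city, state, *country = [part.strip() for part in location.split(",")]
    print(city, state, country[0] if country else None)
# San Francisco CA US
# Berlin Germany None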
@@ -213,3 +227,24 @@ class GoogleJobsScraper(Scraper):
         if result:
             return result
         return None
+
+    @staticmethod
+    def _find_job_info_initial_page(html_text: str):
+        pattern = (
+            f'520084652":('
+            + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
+        )
+        results = []
+        matches = re.finditer(pattern, html_text)
+
+        import json
+
+        for match in matches:
+            try:
+                parsed_data = json.loads(match.group(1))
+                results.append(parsed_data)
+
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to parse match: {str(e)}")
+                results.append({"raw_match": match.group(0), "error": str(e)})
+        return results
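The regex in `_find_job_info_initial_page` is a hand-rolled bracket-balanced matcher: each nesting level alternates "non-bracket characters or one more bracketed group", bottoming out four levels deep, which appears sufficient for the job arrays inlined in the first page's HTML. A self-contained demo of the same pattern on a made-up snippet:

import json
import re

# Matches a [...] group with brackets nested up to four levels.
nested = r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\]"
sample = '... "520084652":[["Engineer","Acme",["SF",["CA"]]]] ...'
match = re.search('520084652":(' + nested + ')', sample)
print(json.loads(match.group(1)))  # [['Engineer', 'Acme', ['SF', ['CA']]]]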