Mirror of https://github.com/Bunsly/JobSpy.git (synced 2026-03-05 03:54:31 -08:00)

Compare commits: 1.1.73...d52e366ef7 (15 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | d52e366ef7 |  |
|  | 395ebf0017 |  |
|  | 63fddd9b7f |  |
|  | 58956868ae |  |
|  | 4fce836222 |  |
|  | 5ba25e7a7c |  |
|  | f7cb3e9206 |  |
|  | 3ad3f121f7 |  |
|  | ff3c782912 |  |
|  | 338d854b96 |  |
|  | 811d4c40b4 |  |
|  | dba92d22c2 |  |
|  | 10a3592a0f |  |
|  | b7905cc756 |  |
|  | 6867d58829 |  |
README.md (117 changed lines)

@@ -2,16 +2,14 @@
 
 **JobSpy** is a simple, yet comprehensive, job scraping library.
 
-**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
-
 *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
 work with us.*
 
 ## Features
 
 - Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
-- Aggregates the job postings in a Pandas DataFrame
-- Proxies support
+- Aggregates the job postings in a dataframe
+- Proxies support to bypass blocking
 
-
+
 
@@ -32,14 +30,14 @@ from jobspy import scrape_jobs
 jobs = scrape_jobs(
     site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
     search_term="software engineer",
+    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
     location="San Francisco, CA",
     results_wanted=20,
-    hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
-    country_indeed='USA',  # only needed for indeed / glassdoor
+    hours_old=72,
+    country_indeed='USA',
 
-    # linkedin_fetch_description=True # get more info such as full description, direct job url for linkedin (slower)
+    # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
     # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
-
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
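For reference, a minimal end-to-end run of the updated quick start is sketched below; it assumes this release of `python-jobspy` is installed, and the site list, result count, and output file name are illustrative rather than taken from this diff.

```python
# Minimal sketch (assumes `pip install python-jobspy`); parameter values are illustrative.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "google"],
    search_term="software engineer",
    # new in this release: a separate natural-language query used only by the Google scraper
    google_search_term="software engineer jobs near San Francisco, CA since yesterday",
    location="San Francisco, CA",
    results_wanted=20,
    hours_old=72,
    country_indeed="USA",
)

print(f"Found {len(jobs)} jobs")
# scrape_jobs returns a pandas DataFrame, so standard pandas I/O applies
jobs.to_csv("jobs.csv", index=False)
```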
@@ -63,10 +61,13 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-|  linkedin, zip_recruiter, indeed, glassdoor
-|  (default is all four)
+|  linkedin, zip_recruiter, indeed, glassdoor, google
+|  (default is all)
 │
 ├── search_term (str)
+│
+├── google_search_term (str)
+|  search term for google jobs. This is the only param for filtering google jobs.
 │
 ├── location (str)
 │
@@ -86,7 +87,7 @@ Optional
 |  number of job results to retrieve for each site specified in 'site_name'
 │
 ├── easy_apply (bool):
-|  filters for jobs that are hosted on the job board site
+|  filters for jobs that are hosted on the job board site (LinkedIn easy apply filter no longer works)
 │
 ├── description_format (str):
 |  markdown, html (Format type of the job descriptions. Default is markdown.)
@@ -131,46 +132,6 @@ Optional
 |  - easy_apply
 ```
 
-
-### JobPost Schema
-
-```plaintext
-JobPost
-├── title
-├── company
-├── company_url
-├── job_url
-├── location
-│ ├── country
-│ ├── city
-│ ├── state
-├── description
-├── job_type: fulltime, parttime, internship, contract
-├── job_function
-│ ├── interval: yearly, monthly, weekly, daily, hourly
-│ ├── min_amount
-│ ├── max_amount
-│ ├── currency
-│ └── salary_source: direct_data, description (parsed from posting)
-├── date_posted
-├── emails
-└── is_remote
-
-Linkedin specific
-└── job_level
-
-Linkedin & Indeed specific
-└── company_industry
-
-Indeed specific
-├── company_country
-├── company_addresses
-├── company_employees_label
-├── company_revenue_label
-├── company_description
-└── company_logo
-```
-
 ## Supported Countries for Job Searching
 
 ### **LinkedIn**
@@ -217,7 +178,18 @@ You can specify the following countries when searching on Indeed (use the exact
 
 ---
 **Q: Why is Indeed giving unrelated roles?**
-**A:** Indeed is searching each one of your terms e.g. software intern, it searches software OR intern. Try search_term='"software intern"' in quotes for stricter searching
+**A:** Indeed searches the description too.
 
+- use - to remove words
+- "" for exact match
+
+Example of a good Indeed query
+
+```py
+search_term='"engineering intern" software summer (java OR python OR c++) 2025 -tax -marketing'
+```
+
+This searches the description/title and must include software, summer, 2025, one of the languages, engineering intern exactly, no tax, no marketing.
+
 ---
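To tie the operators above to actual usage, a sketch like the following passes the stricter query through `scrape_jobs`; the site list, `results_wanted`, and printed columns are illustrative only.

```python
# Illustrative only: the quoted phrase must match exactly, -tax / -marketing exclude terms,
# and (java OR python OR c++) requires at least one of the listed languages.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed"],
    search_term='"engineering intern" software summer (java OR python OR c++) 2025 -tax -marketing',
    country_indeed="USA",
    results_wanted=10,
)
if not jobs.empty:
    print(jobs[["title", "company", "location"]].head())
```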
@@ -229,8 +201,41 @@ You can specify the following countries when searching on Indeed (use the exact
 
 ---
-**Q: Encountering issues with your queries?**
-**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
-persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
-
----
+### JobPost Schema
+
+```plaintext
+JobPost
+├── title
+├── company
+├── company_url
+├── job_url
+├── location
+│ ├── country
+│ ├── city
+│ ├── state
+├── description
+├── job_type: fulltime, parttime, internship, contract
+├── job_function
+│ ├── interval: yearly, monthly, weekly, daily, hourly
+│ ├── min_amount
+│ ├── max_amount
+│ ├── currency
+│ └── salary_source: direct_data, description (parsed from posting)
+├── date_posted
+├── emails
+└── is_remote
+
+Linkedin specific
+└── job_level
+
+Linkedin & Indeed specific
+└── company_industry
+
+Indeed specific
+├── company_country
+├── company_addresses
+├── company_employees_label
+├── company_revenue_label
+├── company_description
+└── company_logo
+```
 
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.73"
+version = "1.1.75"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
jobspy (scrape_jobs entry point)

@@ -24,6 +24,7 @@ from .scrapers.exceptions import (
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
+    google_search_term: str | None = None,
     location: str | None = None,
     distance: int | None = 50,
     is_remote: bool = False,

@@ -86,6 +87,7 @@ def scrape_jobs(
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
+        google_search_term=google_search_term,
         location=location,
         distance=distance,
         is_remote=is_remote,

@@ -216,8 +218,8 @@
         "title",
         "company",
         "location",
-        "job_type",
         "date_posted",
+        "job_type",
         "salary_source",
         "interval",
         "min_amount",

@@ -248,6 +250,8 @@
         jobs_df = jobs_df[desired_order]
 
         # Step 4: Sort the DataFrame as required
-        return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
+        return jobs_df.sort_values(
+            by=["site", "date_posted"], ascending=[True, False]
+        ).reset_index(drop=True)
     else:
         return pd.DataFrame()
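The `.reset_index(drop=True)` addition matters because `sort_values` keeps each row's pre-sort index; the change gives callers a clean 0..n-1 index. A standalone pandas sketch (toy data, not JobSpy output) shows the difference.

```python
import pandas as pd

df = pd.DataFrame({
    "site": ["linkedin", "indeed", "indeed"],
    "date_posted": ["2024-01-01", "2024-03-01", "2024-02-01"],
})

sorted_only = df.sort_values(by=["site", "date_posted"], ascending=[True, False])
print(sorted_only.index.tolist())   # [1, 2, 0] -- original row positions are kept

reindexed = sorted_only.reset_index(drop=True)
print(reindexed.index.tolist())     # [0, 1, 2] -- fresh positional index, old one discarded
```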
jobspy.scrapers (ScraperInput model)

@@ -28,6 +28,7 @@ class SalarySource(Enum):
 class ScraperInput(BaseModel):
     site_type: list[Site]
     search_term: str | None = None
+    google_search_term: str | None = None
 
     location: str | None = None
     country: Country | None = Country.USA
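Because `ScraperInput` is a pydantic `BaseModel`, the new field is optional and defaults to `None`, so scrapers other than Google can ignore it. A minimal sketch (trimmed-down stand-in model, pydantic assumed installed) illustrates the behavior.

```python
from pydantic import BaseModel

class ScraperInputSketch(BaseModel):
    # simplified stand-in for ScraperInput, just to show the optional field
    search_term: str | None = None
    google_search_term: str | None = None

inp = ScraperInputSketch(search_term="software engineer")
print(inp.google_search_term)  # None unless explicitly provided by the caller
```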
jobspy.scrapers.google (Google scraper)

@@ -2,7 +2,7 @@
 jobspy.scrapers.google
 ~~~~~~~~~~~~~~~~~~~
 
-This module contains routines to scrape Glassdoor.
+This module contains routines to scrape Google.
 """
 
 from __future__ import annotations
@@ -34,12 +34,11 @@ class GoogleJobsScraper(Scraper):
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):
         """
-        Initializes GlassdoorScraper with the Glassdoor job search url
+        Initializes Google Scraper with the Goodle jobs search url
         """
         site = Site(Site.GOOGLE)
         super().__init__(site, proxies=proxies, ca_cert=ca_cert)
 
-        self.base_url = None
         self.country = None
         self.session = None
         self.scraper_input = None
@@ -50,24 +49,24 @@ class GoogleJobsScraper(Scraper):
 
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
-        Scrapes Glassdoor for jobs with scraper_input criteria.
+        Scrapes Google for jobs with scraper_input criteria.
         :param scraper_input: Information about job search criteria.
         :return: JobResponse containing a list of jobs.
         """
         self.scraper_input = scraper_input
         self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
-        self.base_url = self.scraper_input.country.get_glassdoor_url()
 
         self.session = create_session(
             proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
         )
-        forward_cursor = self._get_initial_cursor()
+        forward_cursor, job_list = self._get_initial_cursor_and_jobs()
         if forward_cursor is None:
-            logger.error("initial cursor not found")
-            return JobResponse(jobs=[])
+            logger.warning(
+                "initial cursor not found, try changing your query or there was at most 10 results"
+            )
+            return JobResponse(jobs=job_list)
 
         page = 1
-        job_list: list[JobPost] = []
 
         while (
             len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
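The control flow this hunk implements is a cursor-driven pagination loop: fetch the first page (which now also yields its jobs), then keep requesting the next page with the forward cursor until enough results are collected or the cursor runs out. A generic, self-contained sketch of that pattern (hypothetical `fetch_page` callable, not JobSpy's actual API) is:

```python
from typing import Callable, Optional, Tuple

def paginate(fetch_page: Callable[[Optional[str]], Tuple[list[dict], Optional[str]]],
             results_wanted: int) -> list[dict]:
    """Collect results by following a forward cursor until it is exhausted or enough are gathered."""
    results, cursor = fetch_page(None)        # the first page also yields the initial cursor
    while cursor and len(results) < results_wanted:
        page_items, cursor = fetch_page(cursor)
        if not page_items:                    # empty page: stop rather than loop forever
            break
        results.extend(page_items)
    return results[:results_wanted]
```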
@@ -76,7 +75,11 @@
             logger.info(
                 f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
             )
-            jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            try:
+                jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
+            except Exception as e:
+                logger.error(f"failed to get jobs on page: {page}, {e}")
+                break
             if not jobs:
                 logger.info(f"found no jobs on page: {page}")
                 break
@@ -89,8 +92,8 @@
             ]
         )
 
-    def _get_initial_cursor(self):
-        """Gets initial cursor to paginate through job listings"""
+    def _get_initial_cursor_and_jobs(self) -> Tuple[str, list[JobPost]]:
+        """Gets initial cursor and jobs to paginate through job listings"""
         query = f"{self.scraper_input.search_term} jobs"
 
         def get_time_range(hours_old):
@@ -123,13 +126,22 @@
         if self.scraper_input.is_remote:
             query += " remote"
 
+        if self.scraper_input.google_search_term:
+            query = self.scraper_input.google_search_term
+
         params = {"q": query, "udm": "8"}
         response = self.session.get(self.url, headers=headers_initial, params=params)
 
         pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
         match_fc = re.search(pattern_fc, response.text)
         data_async_fc = match_fc.group(1) if match_fc else None
-        return data_async_fc
+        jobs_raw = self._find_job_info_initial_page(response.text)
+        jobs = []
+        for job_raw in jobs_raw:
+            job_post = self._parse_job(job_raw)
+            if job_post:
+                jobs.append(job_post)
+        return data_async_fc, jobs
 
     def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
         params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
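Taken together with the earlier `is_remote` and time-range handling, the Google query string is built up additively and then replaced outright when `google_search_term` is set. A standalone sketch of that precedence (hypothetical `build_query` helper, simplified relative to the real scraper) is:

```python
def build_query(search_term: str,
                is_remote: bool = False,
                hours_old: int | None = None,
                google_search_term: str | None = None) -> str:
    query = f"{search_term} jobs"
    if hours_old is not None and hours_old <= 24:
        query += " since yesterday"   # stand-in for the scraper's time-range phrases
    if is_remote:
        query += " remote"
    # the new parameter wins outright: it replaces the assembled query rather than appending to it
    if google_search_term:
        query = google_search_term
    return query

print(build_query("software engineer", is_remote=True))
print(build_query("software engineer", google_search_term="software engineer jobs near Austin, TX"))
```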
@@ -149,55 +161,55 @@
         match_fc = re.search(pattern_fc, job_data)
         data_async_fc = match_fc.group(1) if match_fc else None
         jobs_on_page = []
 
         for array in parsed:
 
             _, job_data = array
             if not job_data.startswith("[[["):
                 continue
             job_d = json.loads(job_data)
 
             job_info = self._find_job_info(job_d)
-            job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-
-            title = job_info[0]
-            company_name = job_info[1]
-            location = city = job_info[2]
-            state = country = date_posted = None
-            if location and "," in location:
-                city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
-
-            days_ago_str = job_info[12]
-            if type(days_ago_str) == str:
-                match = re.search(r"\d+", days_ago_str)
-                days_ago = int(match.group()) if match else None
-                date_posted = (datetime.now() - timedelta(days=days_ago)).date()
-
-            description = job_info[19]
-
-            job_post = JobPost(
-                id=f"go-{job_info[28]}",
-                title=title,
-                company_name=company_name,
-                location=Location(
-                    city=city, state=state, country=country[0] if country else None
-                ),
-                job_url=job_url,
-                job_url_direct=job_url,
-                date_posted=date_posted,
-                is_remote="remote" in description.lower()
-                or "wfh" in description.lower(),
-                description=description,
-                emails=extract_emails_from_text(description),
-                job_type=extract_job_type(description),
-            )
-            jobs_on_page.append(job_post)
+            job_post = self._parse_job(job_info)
+            if job_post:
+                jobs_on_page.append(job_post)
         return jobs_on_page, data_async_fc
 
+    def _parse_job(self, job_info: list):
+        job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
+        if job_url in self.seen_urls:
+            return
+        self.seen_urls.add(job_url)
+
+        title = job_info[0]
+        company_name = job_info[1]
+        location = city = job_info[2]
+        state = country = date_posted = None
+        if location and "," in location:
+            city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
+
+        days_ago_str = job_info[12]
+        if type(days_ago_str) == str:
+            match = re.search(r"\d+", days_ago_str)
+            days_ago = int(match.group()) if match else None
+            date_posted = (datetime.now() - timedelta(days=days_ago)).date()
+
+        description = job_info[19]
+
+        job_post = JobPost(
+            id=f"go-{job_info[28]}",
+            title=title,
+            company_name=company_name,
+            location=Location(
+                city=city, state=state, country=country[0] if country else None
+            ),
+            job_url=job_url,
+            date_posted=date_posted,
+            is_remote="remote" in description.lower() or "wfh" in description.lower(),
+            description=description,
+            emails=extract_emails_from_text(description),
+            job_type=extract_job_type(description),
+        )
+        return job_post
+
     @staticmethod
     def _find_job_info(jobs_data: list | dict) -> list | None:
         """Iterates through the JSON data to find the job listings"""
@@ -215,3 +227,24 @@
                 if result:
                     return result
         return None
+
+    @staticmethod
+    def _find_job_info_initial_page(html_text: str):
+        pattern = (
+            f'520084652":('
+            + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
+        )
+        results = []
+        matches = re.finditer(pattern, html_text)
+
+        import json
+
+        for match in matches:
+            try:
+                parsed_data = json.loads(match.group(1))
+                results.append(parsed_data)
+
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to parse match: {str(e)}")
+                results.append({"raw_match": match.group(0), "error": str(e)})
+        return results
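The new `_find_job_info_initial_page` pulls job payloads straight out of the initial HTML by matching the literal key `520084652":` followed by a bracket-balanced JSON array (the regex tolerates up to four levels of nesting). A self-contained toy run of that extraction idea, on fabricated HTML rather than a real Google response, looks like:

```python
import json
import re

# Fabricated snippet standing in for the initial Google Jobs HTML response.
html_text = 'noise "520084652":[["Engineer", ["SF", "CA"], [1, [2, 3]]]] more noise'

pattern = (
    '520084652":('
    + r"\[(?:[^\[\]]|\[(?:[^\[\]]|\[(?:[^\[\]]|\[[^\[\]]*\])*\])*\])*\])"
)

for match in re.finditer(pattern, html_text):
    data = json.loads(match.group(1))  # the captured group is valid JSON once isolated
    print(data)                        # [['Engineer', ['SF', 'CA'], [1, [2, 3]]]]
```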