mirror of https://github.com/Bunsly/JobSpy
Compare commits
No commits in common. "811d4c40b46eb1eb303336c5cc5d0123108d3f4c" and "f395597fdddfe97384beca26a6690cf0b54c4697" have entirely different histories.
811d4c40b4 ... f395597fdd

README.md
@@ -2,12 +2,14 @@
 **JobSpy** is a simple, yet comprehensive, job scraping library.
 
+**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
+
 *Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
 work with us.*
 
 ## Features
 
-- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, **Google**, & **ZipRecruiter** simultaneously
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
 - Aggregates the job postings in a Pandas DataFrame
 - Proxies support
 
@@ -28,9 +30,9 @@ import csv
 from jobspy import scrape_jobs
 
 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
-    location="San Francisco, CA",
+    location="Dallas, TX",
     results_wanted=20,
     hours_old=72,  # (only Linkedin/Indeed is hour specific, others round up to days old)
     country_indeed='USA',  # only needed for indeed / glassdoor
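The usage example above is cut off at the hunk boundary. For orientation, a minimal sketch of what typically follows in this README — printing the result and writing the DataFrame to CSV (the filename and quoting options here are illustrative, not taken from this diff):

```python
import csv

# `jobs` is the pandas DataFrame returned by scrape_jobs(...) above.
print(f"Found {len(jobs)} jobs")
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
```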
@@ -61,8 +63,8 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Optional
 ├── site_name (list|str):
-|    linkedin, zip_recruiter, indeed, glassdoor, google
-|    (default is all)
+|    linkedin, zip_recruiter, indeed, glassdoor
+|    (default is all four)
 │
 ├── search_term (str)
 │
@@ -78,6 +80,9 @@ Optional
 |    in format ['user:pass@host:port', 'localhost']
 |    each job board scraper will round robin through the proxies
 │
+├── ca_cert (str)
+|    path to CA Certificate file for proxies
+│
 ├── is_remote (bool)
 │
 ├── results_wanted (int):
@@ -111,9 +116,6 @@ Optional
 ├── enforce_annual_salary (bool):
 |    converts wages to annual salary
 │
-├── ca_cert (str)
-|    path to CA Certificate file for proxies
-│
 ```
 
 ```
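Both sides document the same `proxies` format; the hunks above only move the `ca_cert` entry within the list. A sketch of passing the two options together (the proxy entries and certificate path are placeholders in the documented format):

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name="indeed",
    search_term="software engineer",
    proxies=["user:pass@host:port", "localhost"],  # round-robined per job board scraper
    ca_cert="/path/to/ca-bundle.crt",  # CA certificate file used with the proxies
)
```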
@@ -166,14 +168,14 @@ Indeed specific
 ├── company_employees_label
 ├── company_revenue_label
 ├── company_description
-└── company_logo
+└── logo_photo_url
 ```
 
 ## Supported Countries for Job Searching
 
-### **LinkedIn / Google**
+### **LinkedIn**
 
-LinkedIn & Google searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter.
 
 ### **ZipRecruiter**
 
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.74"
+version = "1.1.72"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
src/jobspy/__init__.py

@@ -9,7 +9,6 @@ from .scrapers.utils import set_logger_level, extract_salary, create_logger
 from .scrapers.indeed import IndeedScraper
 from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
-from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
 from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
 from .scrapers.exceptions import (
@@ -17,7 +16,6 @@ from .scrapers.exceptions import (
     IndeedException,
     ZipRecruiterException,
     GlassdoorException,
-    GoogleJobsException,
 )
 
 
@@ -52,7 +50,6 @@ def scrape_jobs(
         Site.INDEED: IndeedScraper,
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
-        Site.GOOGLE: GoogleJobsScraper,
     }
     set_logger_level(verbose)
 
@@ -226,12 +223,12 @@ def scrape_jobs(
         "is_remote",
         "job_level",
         "job_function",
+        "company_industry",
         "listing_type",
         "emails",
         "description",
-        "company_industry",
         "company_url",
-        "company_logo",
+        "logo_photo_url",
         "company_url_direct",
         "company_addresses",
         "company_num_employees",
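The practical effect of this hunk is the rename of the output column (plus the new position of `company_industry`); code consuming the DataFrame has to follow the rename. A one-line sketch, assuming `jobs` comes from `scrape_jobs` on the f395597fdd side:

```python
# After the rename, downstream code reads the new column name:
logos = jobs["logo_photo_url"].dropna()
```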
src/jobspy/jobs/__init__.py

@@ -256,7 +256,7 @@ class JobPost(BaseModel):
     company_num_employees: str | None = None
     company_revenue: str | None = None
     company_description: str | None = None
-    company_logo: str | None = None
+    logo_photo_url: str | None = None
     banner_photo_url: str | None = None
 
     # linkedin only atm
src/jobspy/scrapers/__init__.py

@@ -17,14 +17,11 @@ class Site(Enum):
     INDEED = "indeed"
     ZIP_RECRUITER = "zip_recruiter"
     GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
 
 
 class SalarySource(Enum):
     DIRECT_DATA = "direct_data"
     DESCRIPTION = "description"
 
 
 class ScraperInput(BaseModel):
     site_type: list[Site]
     search_term: str | None = None

@@ -45,9 +42,7 @@ class ScraperInput(BaseModel):
 
 
 class Scraper(ABC):
-    def __init__(
-        self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None
-    ):
+    def __init__(self, site: Site, proxies: list[str] | None = None, ca_cert: str | None = None):
         self.site = site
         self.proxies = proxies
         self.ca_cert = ca_cert
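For orientation, this is how the `Scraper` base class above is consumed: each job board implements `__init__` and `scrape`. A hypothetical subclass modeled on the `GoogleJobsScraper` removed further down (the class name and body are illustrative only):

```python
class ExampleScraper(Scraper):
    def __init__(self, proxies: list[str] | None = None, ca_cert: str | None = None):
        # A concrete scraper picks its Site and forwards the proxy settings.
        super().__init__(Site.LINKEDIN, proxies=proxies, ca_cert=ca_cert)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        # Real scrapers page through results here; this stub returns none.
        return JobResponse(jobs=[])
```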
src/jobspy/scrapers/exceptions.py

@@ -24,8 +24,3 @@ class ZipRecruiterException(Exception):
 class GlassdoorException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with Glassdoor")
 
 
-class GoogleJobsException(Exception):
-    def __init__(self, message=None):
-        super().__init__(message or "An error occurred with Google Jobs")
src/jobspy/scrapers/glassdoor/__init__.py

@@ -214,7 +214,7 @@ class GlassdoorScraper(Scraper):
             is_remote=is_remote,
             description=description,
             emails=extract_emails_from_text(description) if description else None,
-            company_logo=company_logo,
+            logo_photo_url=company_logo,
             listing_type=listing_type,
         )
 
src/jobspy/scrapers/google/__init__.py

@@ -1,215 +0,0 @@
-"""
-jobspy.scrapers.google
-~~~~~~~~~~~~~~~~~~~
-
-This module contains routines to scrape Google.
-"""
-
-from __future__ import annotations
-
-import math
-import re
-import json
-from typing import Tuple
-from datetime import datetime, timedelta
-
-from .constants import headers_jobs, headers_initial, async_param
-from .. import Scraper, ScraperInput, Site
-from ..utils import extract_emails_from_text, create_logger, extract_job_type
-from ..utils import (
-    create_session,
-)
-from ...jobs import (
-    JobPost,
-    JobResponse,
-    Location,
-    JobType,
-)
-
-logger = create_logger("Google")
-
-
-class GoogleJobsScraper(Scraper):
-    def __init__(
-        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
-    ):
-        """
-        Initializes Google Scraper with the Goodle jobs search url
-        """
-        site = Site(Site.GOOGLE)
-        super().__init__(site, proxies=proxies, ca_cert=ca_cert)
-
-        self.country = None
-        self.session = None
-        self.scraper_input = None
-        self.jobs_per_page = 10
-        self.seen_urls = set()
-        self.url = "https://www.google.com/search"
-        self.jobs_url = "https://www.google.com/async/callback:550"
-
-    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
-        """
-        Scrapes Google for jobs with scraper_input criteria.
-        :param scraper_input: Information about job search criteria.
-        :return: JobResponse containing a list of jobs.
-        """
-        self.scraper_input = scraper_input
-        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
-
-        self.session = create_session(
-            proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
-        )
-        forward_cursor = self._get_initial_cursor()
-        if forward_cursor is None:
-            logger.error("initial cursor not found")
-            return JobResponse(jobs=[])
-
-        page = 1
-        job_list: list[JobPost] = []
-
-        while (
-            len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset
-            and forward_cursor
-        ):
-            logger.info(
-                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
-            )
-            jobs, forward_cursor = self._get_jobs_next_page(forward_cursor)
-            if not jobs:
-                logger.info(f"found no jobs on page: {page}")
-                break
-            job_list += jobs
-            page += 1
-        return JobResponse(
-            jobs=job_list[
-                scraper_input.offset : scraper_input.offset
-                + scraper_input.results_wanted
-            ]
-        )
-
-    def _get_initial_cursor(self):
-        """Gets initial cursor to paginate through job listings"""
-        query = f"{self.scraper_input.search_term} jobs"
-
-        def get_time_range(hours_old):
-            if hours_old <= 24:
-                return "since yesterday"
-            elif hours_old <= 72:
-                return "in the last 3 days"
-            elif hours_old <= 168:
-                return "in the last week"
-            else:
-                return "in the last month"
-
-        job_type_mapping = {
-            JobType.FULL_TIME: "Full time",
-            JobType.PART_TIME: "Part time",
-            JobType.INTERNSHIP: "Internship",
-            JobType.CONTRACT: "Contract",
-        }
-
-        if self.scraper_input.job_type in job_type_mapping:
-            query += f" {job_type_mapping[self.scraper_input.job_type]}"
-
-        if self.scraper_input.location:
-            query += f" near {self.scraper_input.location}"
-
-        if self.scraper_input.hours_old:
-            time_filter = get_time_range(self.scraper_input.hours_old)
-            query += f" {time_filter}"
-
-        if self.scraper_input.is_remote:
-            query += " remote"
-
-        params = {"q": query, "udm": "8"}
-        response = self.session.get(self.url, headers=headers_initial, params=params)
-
-        pattern_fc = r'<div jsname="Yust4d"[^>]+data-async-fc="([^"]+)"'
-        match_fc = re.search(pattern_fc, response.text)
-        data_async_fc = match_fc.group(1) if match_fc else None
-        return data_async_fc
-
-    def _get_jobs_next_page(self, forward_cursor: str) -> Tuple[list[JobPost], str]:
-        params = {"fc": [forward_cursor], "fcv": ["3"], "async": [async_param]}
-        response = self.session.get(self.jobs_url, headers=headers_jobs, params=params)
-        return self._parse_jobs(response.text)
-
-    def _parse_jobs(self, job_data: str) -> Tuple[list[JobPost], str]:
-        """
-        Parses jobs on a page with next page cursor
-        """
-        start_idx = job_data.find("[[[")
-        end_idx = job_data.rindex("]]]") + 3
-        s = job_data[start_idx:end_idx]
-        parsed = json.loads(s)[0]
-
-        pattern_fc = r'data-async-fc="([^"]+)"'
-        match_fc = re.search(pattern_fc, job_data)
-        data_async_fc = match_fc.group(1) if match_fc else None
-        jobs_on_page = []
-
-        for array in parsed:
-            _, job_data = array
-            if not job_data.startswith("[[["):
-                continue
-            job_d = json.loads(job_data)
-
-            job_info = self._find_job_info(job_d)
-
-            job_url = job_info[3][0][0] if job_info[3] and job_info[3][0] else None
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-
-            title = job_info[0]
-            company_name = job_info[1]
-            location = city = job_info[2]
-            state = country = date_posted = None
-            if location and "," in location:
-                city, state, *country = [*map(lambda x: x.strip(), location.split(","))]
-
-            days_ago_str = job_info[12]
-            if type(days_ago_str) == str:
-                match = re.search(r"\d+", days_ago_str)
-                days_ago = int(match.group()) if match else None
-                date_posted = (datetime.now() - timedelta(days=days_ago)).date()
-
-            description = job_info[19]
-
-            job_post = JobPost(
-                id=f"go-{job_info[28]}",
-                title=title,
-                company_name=company_name,
-                location=Location(
-                    city=city, state=state, country=country[0] if country else None
-                ),
-                job_url=job_url,
-                job_url_direct=job_url,
-                date_posted=date_posted,
-                is_remote="remote" in description.lower()
-                or "wfh" in description.lower(),
-                description=description,
-                emails=extract_emails_from_text(description),
-                job_type=extract_job_type(description),
-            )
-            jobs_on_page.append(job_post)
-        return jobs_on_page, data_async_fc
-
-    @staticmethod
-    def _find_job_info(jobs_data: list | dict) -> list | None:
-        """Iterates through the JSON data to find the job listings"""
-        if isinstance(jobs_data, dict):
-            for key, value in jobs_data.items():
-                if key == "520084652" and isinstance(value, list):
-                    return value
-                else:
-                    result = GoogleJobsScraper._find_job_info(value)
-                    if result:
-                        return result
-        elif isinstance(jobs_data, list):
-            for item in jobs_data:
-                result = GoogleJobsScraper._find_job_info(item)
-                if result:
-                    return result
-        return None
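To make the removed `_get_initial_cursor` concrete, here is the query string it would assemble for one set of inputs before requesting `https://www.google.com/search` with `udm=8` — a worked example tracing the code above, not captured output:

```python
# Inputs: search_term="software engineer", location="Dallas, TX",
# hours_old=72, is_remote=True, job_type=None
query = "software engineer jobs"
query += " near Dallas, TX"     # location clause
query += " in the last 3 days"  # get_time_range(72)
query += " remote"              # is_remote flag
assert query == "software engineer jobs near Dallas, TX in the last 3 days remote"
```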
src/jobspy/scrapers/google/constants.py

@@ -1,52 +0,0 @@
-headers_initial = {
-    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-    "accept-language": "en-US,en;q=0.9",
-    "priority": "u=0, i",
-    "referer": "https://www.google.com/",
-    "sec-ch-prefers-color-scheme": "dark",
-    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
-    "sec-ch-ua-arch": '"arm"',
-    "sec-ch-ua-bitness": '"64"',
-    "sec-ch-ua-form-factors": '"Desktop"',
-    "sec-ch-ua-full-version": '"130.0.6723.58"',
-    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-model": '""',
-    "sec-ch-ua-platform": '"macOS"',
-    "sec-ch-ua-platform-version": '"15.0.1"',
-    "sec-ch-ua-wow64": "?0",
-    "sec-fetch-dest": "document",
-    "sec-fetch-mode": "navigate",
-    "sec-fetch-site": "same-origin",
-    "sec-fetch-user": "?1",
-    "upgrade-insecure-requests": "1",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
-    "x-browser-channel": "stable",
-    "x-browser-copyright": "Copyright 2024 Google LLC. All rights reserved.",
-    "x-browser-year": "2024",
-}
-
-headers_jobs = {
-    "accept": "*/*",
-    "accept-language": "en-US,en;q=0.9",
-    "priority": "u=1, i",
-    "referer": "https://www.google.com/",
-    "sec-ch-prefers-color-scheme": "dark",
-    "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
-    "sec-ch-ua-arch": '"arm"',
-    "sec-ch-ua-bitness": '"64"',
-    "sec-ch-ua-form-factors": '"Desktop"',
-    "sec-ch-ua-full-version": '"130.0.6723.58"',
-    "sec-ch-ua-full-version-list": '"Chromium";v="130.0.6723.58", "Google Chrome";v="130.0.6723.58", "Not?A_Brand";v="99.0.0.0"',
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-model": '""',
-    "sec-ch-ua-platform": '"macOS"',
-    "sec-ch-ua-platform-version": '"15.0.1"',
-    "sec-ch-ua-wow64": "?0",
-    "sec-fetch-dest": "empty",
-    "sec-fetch-mode": "cors",
-    "sec-fetch-site": "same-origin",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
-}
-
-async_param = "_basejs:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/am=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAACAAAoICAAAAAAAKMAfAAAAIAQAAAAAAAAAAAAACCAAAEJDAAACAAAAAGABAIAAARBAAABAAAAAgAgQAABAASKAfv8JAAABAAAAAAwAQAQACQAAAAAAcAEAQABoCAAAABAAAIABAACAAAAEAAAAFAAAAAAAAAAAAAAAAAAAAAAAAACAQADoBwAAAAAAAAAAAAAQBAAAAATQAAoACOAHAAAAAAAAAQAAAIIAAAA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/dg=0/br=1/rs=ACT90oGxMeaFMCopIHq5tuQM-6_3M_VMjQ,_basecss:/xjs/_/ss/k=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAIAIAIAoEwCAADIC8AfsgEAawwAPkAAjgoAGAAAAAAAAEADAAAAAAIgAECHAAAAAAAAAAABAQAggAARQAAAQCEAAAAAIAAAABgAAAAAIAQIACCAAfB-AAFIQABoCEA_CgEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAAAAQEAAABAgAMCPAAA4AoE2BAEAggSAAIoAQAAAAAgAAAAACCAQAAAxEwA_ZAACAAAAAAAAAAkAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAQAEAAAAAAAAAAAAAAAAAAAAAQA/br=1/rs=ACT90oGZc36t3uUQkj0srnIvvbHjO2hgyg,_basecomb:/xjs/_/js/k=xjs.s.en_US.JwveA-JiKmg.2018.O/ck=xjs.s.IwsGu62EDtU.L.B1.O/am=QOoQIAQAAAQAREADEBAAAAAAAAAAAAAAAAAAAAAgAQAAIAAAgAQAAAKAIAoIqEwCAADIK8AfsgEAawwAPkAAjgoAGAAACCAAAEJDAAACAAIgAGCHAIAAARBAAABBAQAggAgRQABAQSOAfv8JIAABABgAAAwAYAQICSCAAfB-cAFIQABoCEA_ChEAAIABAACEgHAEwwAEFQAM4CgAAAAAAAAAAAAACABCAACAQEDoBxAgAMCPAAA4AoE2BAEAggTQAIoASOAHAAgAAAAACSAQAIIxEwA_ZAACAAAAAAAAcB8APB4wHFJ4AAAAAAAAAAAAAAAACECCYA5If0EACAAAAAAAAAAAAAAAAAAAUgRNXG4AMAE/d=1/ed=1/dg=0/br=1/ujg=1/rs=ACT90oFNLTjPzD_OAqhhtXwe2pg1T3WpBg,_fmt:prog,_id:fc_5FwaZ86OKsfdwN4P4La3yA4_2"
src/jobspy/scrapers/indeed/__init__.py

@@ -72,7 +72,7 @@ class IndeedScraper(Scraper):
 
         while len(self.seen_urls) < scraper_input.results_wanted + scraper_input.offset:
             logger.info(
-                f"search page: {page} / {math.ceil(scraper_input.results_wanted / self.jobs_per_page)}"
+                f"search page: {page} / {math.ceil(scraper_input.results_wanted / 100)}"
             )
             jobs, cursor = self._scrape_page(cursor)
             if not jobs:
@@ -258,7 +258,7 @@ class IndeedScraper(Scraper):
             company_num_employees=employer_details.get("employeesLocalizedLabel"),
             company_revenue=employer_details.get("revenueLocalizedLabel"),
             company_description=employer_details.get("briefDescription"),
-            company_logo=(
+            logo_photo_url=(
                 employer["images"].get("squareLogoUrl")
                 if employer and employer.get("images")
                 else None
src/jobspy/scrapers/linkedin/__init__.py

@@ -232,7 +232,7 @@ class LinkedInScraper(Scraper):
             description=job_details.get("description"),
             job_url_direct=job_details.get("job_url_direct"),
             emails=extract_emails_from_text(job_details.get("description")),
-            company_logo=job_details.get("company_logo"),
+            logo_photo_url=job_details.get("logo_photo_url"),
             job_function=job_details.get("job_function"),
         )
 
@@ -275,7 +275,7 @@ class LinkedInScraper(Scraper):
         if job_function_span:
             job_function = job_function_span.text.strip()
 
-        company_logo = (
+        logo_photo_url = (
             logo_image.get("data-delayed-url")
             if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
             else None
@@ -286,7 +286,7 @@ class LinkedInScraper(Scraper):
             "company_industry": self._parse_company_industry(soup),
             "job_type": self._parse_job_type(soup),
             "job_url_direct": self._parse_job_url_direct(soup),
-            "company_logo": company_logo,
+            "logo_photo_url": logo_photo_url,
             "job_function": job_function,
         }
 
src/jobspy/scrapers/utils.py

@@ -264,22 +264,3 @@ def extract_salary(
     else:
         return interval, min_salary, max_salary, "USD"
     return None, None, None, None
-
-
-def extract_job_type(description: str):
-    if not description:
-        return []
-
-    keywords = {
-        JobType.FULL_TIME: r"full\s?time",
-        JobType.PART_TIME: r"part\s?time",
-        JobType.INTERNSHIP: r"internship",
-        JobType.CONTRACT: r"contract",
-    }
-
-    listing_types = []
-    for key, pattern in keywords.items():
-        if re.search(pattern, description, re.IGNORECASE):
-            listing_types.append(key)
-
-    return listing_types if listing_types else None
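A quick illustration of the removed `extract_job_type` helper. Note that `r"full\s?time"` matches "full time" and "fulltime" but not the hyphenated "full-time", and that the function returns `None` (not an empty list) when nothing matches a non-empty description:

```python
print(extract_job_type("Hiring a full time or contract engineer"))
# -> [JobType.FULL_TIME, JobType.CONTRACT]
print(extract_job_type("Senior staff role"))
# -> None
```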
tests/test_google.py

@@ -1,12 +0,0 @@
-from jobspy import scrape_jobs
-import pandas as pd
-
-
-def test_google():
-    result = scrape_jobs(
-        site_name="google", search_term="software engineer", results_wanted=5
-    )
-
-    assert (
-        isinstance(result, pd.DataFrame) and len(result) == 5
-    ), "Result should be a non-empty DataFrame"