mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d000a81eb3 | ||
|
|
ccb0c17660 | ||
|
|
df339610fa | ||
|
|
c501006bd8 | ||
|
|
89a3ee231c | ||
|
|
6439f71433 | ||
|
|
7f6271b2e0 |
14
README.md
14
README.md
@@ -13,9 +13,6 @@ work with us.*
|
|||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a Pandas DataFrame
|
||||||
- Proxies support
|
- Proxies support
|
||||||
|
|
||||||
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
|
|
||||||
Updated for release v1.1.3
|
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
|
|||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA', # only needed for indeed / glassdoor
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
||||||
# proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
print(f"Found {len(jobs)} jobs")
|
print(f"Found {len(jobs)} jobs")
|
||||||
print(jobs.head())
|
print(jobs.head())
|
||||||
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
|
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
|
||||||
```
|
```
|
||||||
|
|
||||||
### Output
|
### Output
|
||||||
@@ -79,7 +76,7 @@ Optional
|
|||||||
├── job_type (str):
|
├── job_type (str):
|
||||||
| fulltime, parttime, internship, contract
|
| fulltime, parttime, internship, contract
|
||||||
│
|
│
|
||||||
├── proxies ():
|
├── proxies (list):
|
||||||
| in format ['user:pass@host:port', 'localhost']
|
| in format ['user:pass@host:port', 'localhost']
|
||||||
| each job board will round robin through the proxies
|
| each job board will round robin through the proxies
|
||||||
│
|
│
|
||||||
@@ -143,13 +140,14 @@ JobPost
|
|||||||
│ ├── state (str)
|
│ ├── state (str)
|
||||||
├── description (str)
|
├── description (str)
|
||||||
├── job_type (str): fulltime, parttime, internship, contract
|
├── job_type (str): fulltime, parttime, internship, contract
|
||||||
|
├── job_function (str)
|
||||||
├── compensation (object)
|
├── compensation (object)
|
||||||
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
||||||
│ ├── min_amount (int)
|
│ ├── min_amount (int)
|
||||||
│ ├── max_amount (int)
|
│ ├── max_amount (int)
|
||||||
│ └── currency (enum)
|
│ └── currency (enum)
|
||||||
└── date_posted (date)
|
├── date_posted (date)
|
||||||
└── emails (str)
|
├── emails (str)
|
||||||
└── is_remote (bool)
|
└── is_remote (bool)
|
||||||
|
|
||||||
Indeed specific
|
Indeed specific
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.54"
|
version = "1.1.57"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from typing import Tuple
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
from .jobs import JobType, Location
|
from .jobs import JobType, Location
|
||||||
from .scrapers.utils import logger, set_logger_level
|
from .scrapers.utils import logger, set_logger_level, extract_salary
|
||||||
from .scrapers.indeed import IndeedScraper
|
from .scrapers.indeed import IndeedScraper
|
||||||
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
from .scrapers.ziprecruiter import ZipRecruiterScraper
|
||||||
from .scrapers.glassdoor import GlassdoorScraper
|
from .scrapers.glassdoor import GlassdoorScraper
|
||||||
@@ -118,6 +118,21 @@ def scrape_jobs(
|
|||||||
site_value, scraped_data = future.result()
|
site_value, scraped_data = future.result()
|
||||||
site_to_jobs_dict[site_value] = scraped_data
|
site_to_jobs_dict[site_value] = scraped_data
|
||||||
|
|
||||||
|
def convert_to_annual(job_data: dict):
    """
    Rewrite a job's pay range in place so that it is expressed per year.

    ``min_amount`` and ``max_amount`` are multiplied by the number of pay
    periods per year for the current ``interval``; an interval that is not
    listed below leaves the amounts untouched. ``interval`` is always set to
    "yearly" afterwards.

    :param job_data: dict with "interval", "min_amount" and "max_amount" keys;
        the amounts must be numeric (the caller checks they are truthy)
    :return: None, ``job_data`` is mutated
    """
    # NOTE(review): 2080 and 260 presumably mean 40 h x 52 weeks and
    # 5 days x 52 weeks -- confirm with the author.
    periods_per_year = {
        "hourly": 2080,
        "monthly": 12,
        "weekly": 52,
        "daily": 260,
    }
    factor = periods_per_year.get(job_data["interval"])
    if factor is not None:
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"
|
||||||
|
|
||||||
jobs_dfs: list[pd.DataFrame] = []
|
jobs_dfs: list[pd.DataFrame] = []
|
||||||
|
|
||||||
for site, job_response in site_to_jobs_dict.items():
|
for site, job_response in site_to_jobs_dict.items():
|
||||||
@@ -150,11 +165,22 @@ def scrape_jobs(
|
|||||||
job_data["min_amount"] = compensation_obj.get("min_amount")
|
job_data["min_amount"] = compensation_obj.get("min_amount")
|
||||||
job_data["max_amount"] = compensation_obj.get("max_amount")
|
job_data["max_amount"] = compensation_obj.get("max_amount")
|
||||||
job_data["currency"] = compensation_obj.get("currency", "USD")
|
job_data["currency"] = compensation_obj.get("currency", "USD")
|
||||||
|
if (
|
||||||
|
job_data["interval"]
|
||||||
|
and job_data["interval"] != "yearly"
|
||||||
|
and job_data["min_amount"]
|
||||||
|
and job_data["max_amount"]
|
||||||
|
):
|
||||||
|
convert_to_annual(job_data)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
job_data["interval"] = None
|
if country_enum == Country.USA:
|
||||||
job_data["min_amount"] = None
|
(
|
||||||
job_data["max_amount"] = None
|
job_data["interval"],
|
||||||
job_data["currency"] = None
|
job_data["min_amount"],
|
||||||
|
job_data["max_amount"],
|
||||||
|
job_data["currency"],
|
||||||
|
) = extract_salary(job_data["description"])
|
||||||
|
|
||||||
job_df = pd.DataFrame([job_data])
|
job_df = pd.DataFrame([job_data])
|
||||||
jobs_dfs.append(job_df)
|
jobs_dfs.append(job_df)
|
||||||
@@ -182,6 +208,7 @@ def scrape_jobs(
|
|||||||
"max_amount",
|
"max_amount",
|
||||||
"currency",
|
"currency",
|
||||||
"is_remote",
|
"is_remote",
|
||||||
|
"job_function",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
"company_url",
|
"company_url",
|
||||||
|
|||||||
@@ -254,6 +254,9 @@ class JobPost(BaseModel):
|
|||||||
logo_photo_url: str | None = None
|
logo_photo_url: str | None = None
|
||||||
banner_photo_url: str | None = None
|
banner_photo_url: str | None = None
|
||||||
|
|
||||||
|
# linkedin only atm
|
||||||
|
job_function: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class JobResponse(BaseModel):
|
class JobResponse(BaseModel):
|
||||||
jobs: list[JobPost] = []
|
jobs: list[JobPost] = []
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ class GlassdoorScraper(Scraper):
|
|||||||
if location_type is None:
|
if location_type is None:
|
||||||
logger.error("Glassdoor: location not parsed")
|
logger.error("Glassdoor: location not parsed")
|
||||||
return JobResponse(jobs=[])
|
return JobResponse(jobs=[])
|
||||||
all_jobs: list[JobPost] = []
|
job_list: list[JobPost] = []
|
||||||
cursor = None
|
cursor = None
|
||||||
|
|
||||||
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
||||||
@@ -81,14 +81,14 @@ class GlassdoorScraper(Scraper):
|
|||||||
jobs, cursor = self._fetch_jobs_page(
|
jobs, cursor = self._fetch_jobs_page(
|
||||||
scraper_input, location_id, location_type, page, cursor
|
scraper_input, location_id, location_type, page, cursor
|
||||||
)
|
)
|
||||||
all_jobs.extend(jobs)
|
job_list.extend(jobs)
|
||||||
if not jobs or len(all_jobs) >= scraper_input.results_wanted:
|
if not jobs or len(job_list) >= scraper_input.results_wanted:
|
||||||
all_jobs = all_jobs[: scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Glassdoor: {str(e)}")
|
logger.error(f"Glassdoor: {str(e)}")
|
||||||
break
|
break
|
||||||
return JobResponse(jobs=all_jobs)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def _fetch_jobs_page(
|
def _fetch_jobs_page(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -297,8 +297,8 @@ class IndeedScraper(Scraper):
|
|||||||
max_range = comp["range"].get("max")
|
max_range = comp["range"].get("max")
|
||||||
return Compensation(
|
return Compensation(
|
||||||
interval=interval,
|
interval=interval,
|
||||||
min_amount=round(min_range, 2) if min_range is not None else None,
|
min_amount=int(min_range) if min_range is not None else None,
|
||||||
max_amount=round(max_range, 2) if max_range is not None else None,
|
max_amount=int(max_range) if max_range is not None else None,
|
||||||
currency=job["compensation"]["currencyCode"],
|
currency=job["compensation"]["currencyCode"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -13,14 +13,13 @@ import regex as re
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from threading import Lock
|
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urlparse, urlunparse, unquote
|
from urllib.parse import urlparse, urlunparse, unquote
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ..utils import create_session
|
from ..utils import create_session, remove_attributes
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Location,
|
Location,
|
||||||
@@ -70,9 +69,9 @@ class LinkedInScraper(Scraper):
|
|||||||
"""
|
"""
|
||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
job_list: list[JobPost] = []
|
job_list: list[JobPost] = []
|
||||||
seen_urls = set()
|
seen_ids = set()
|
||||||
url_lock = Lock()
|
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
|
||||||
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
|
request_count = 0
|
||||||
seconds_old = (
|
seconds_old = (
|
||||||
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
||||||
)
|
)
|
||||||
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
|
|||||||
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
||||||
)
|
)
|
||||||
while continue_search():
|
while continue_search():
|
||||||
logger.info(f"LinkedIn search page: {page // 25 + 1}")
|
request_count += 1
|
||||||
|
logger.info(f"LinkedIn search page: {request_count}")
|
||||||
params = {
|
params = {
|
||||||
"keywords": scraper_input.search_term,
|
"keywords": scraper_input.search_term,
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
|
|||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
"pageNum": 0,
|
"pageNum": 0,
|
||||||
"start": page + scraper_input.offset,
|
"start": page,
|
||||||
"f_AL": "true" if scraper_input.easy_apply else None,
|
"f_AL": "true" if scraper_input.easy_apply else None,
|
||||||
"f_C": (
|
"f_C": (
|
||||||
",".join(map(str, scraper_input.linkedin_company_ids))
|
",".join(map(str, scraper_input.linkedin_company_ids))
|
||||||
@@ -133,36 +133,34 @@ class LinkedInScraper(Scraper):
|
|||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
for job_card in job_cards:
|
for job_card in job_cards:
|
||||||
job_url = None
|
|
||||||
href_tag = job_card.find("a", class_="base-card__full-link")
|
href_tag = job_card.find("a", class_="base-card__full-link")
|
||||||
if href_tag and "href" in href_tag.attrs:
|
if href_tag and "href" in href_tag.attrs:
|
||||||
href = href_tag.attrs["href"].split("?")[0]
|
href = href_tag.attrs["href"].split("?")[0]
|
||||||
job_id = href.split("-")[-1]
|
job_id = href.split("-")[-1]
|
||||||
job_url = f"{self.base_url}/jobs/view/{job_id}"
|
|
||||||
|
|
||||||
with url_lock:
|
if job_id in seen_ids:
|
||||||
if job_url in seen_urls:
|
|
||||||
continue
|
continue
|
||||||
seen_urls.add(job_url)
|
seen_ids.add(job_id)
|
||||||
try:
|
|
||||||
fetch_desc = scraper_input.linkedin_fetch_description
|
try:
|
||||||
job_post = self._process_job(job_card, job_url, fetch_desc)
|
fetch_desc = scraper_input.linkedin_fetch_description
|
||||||
if job_post:
|
job_post = self._process_job(job_card, job_id, fetch_desc)
|
||||||
job_list.append(job_post)
|
if job_post:
|
||||||
if not continue_search():
|
job_list.append(job_post)
|
||||||
break
|
if not continue_search():
|
||||||
except Exception as e:
|
break
|
||||||
raise LinkedInException(str(e))
|
except Exception as e:
|
||||||
|
raise LinkedInException(str(e))
|
||||||
|
|
||||||
if continue_search():
|
if continue_search():
|
||||||
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
||||||
page += self.jobs_per_page
|
page += len(job_list)
|
||||||
|
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def _process_job(
|
def _process_job(
|
||||||
self, job_card: Tag, job_url: str, full_descr: bool
|
self, job_card: Tag, job_id: str, full_descr: bool
|
||||||
) -> Optional[JobPost]:
|
) -> Optional[JobPost]:
|
||||||
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
|
||||||
|
|
||||||
@@ -209,46 +207,39 @@ class LinkedInScraper(Scraper):
|
|||||||
date_posted = None
|
date_posted = None
|
||||||
job_details = {}
|
job_details = {}
|
||||||
if full_descr:
|
if full_descr:
|
||||||
job_details = self._get_job_details(job_url)
|
job_details = self._get_job_details(job_id)
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=self._get_id(job_url),
|
id=job_id,
|
||||||
title=title,
|
title=title,
|
||||||
company_name=company,
|
company_name=company,
|
||||||
company_url=company_url,
|
company_url=company_url,
|
||||||
location=location,
|
location=location,
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||||
compensation=compensation,
|
compensation=compensation,
|
||||||
job_type=job_details.get("job_type"),
|
job_type=job_details.get("job_type"),
|
||||||
description=job_details.get("description"),
|
description=job_details.get("description"),
|
||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
logo_photo_url=job_details.get("logo_photo_url"),
|
logo_photo_url=job_details.get("logo_photo_url"),
|
||||||
|
job_function=job_details.get("job_function"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_id(self, url: str):
|
def _get_job_details(self, job_id: str) -> dict:
|
||||||
"""
|
|
||||||
Extracts the job id from the job url
|
|
||||||
:param url:
|
|
||||||
:return: str
|
|
||||||
"""
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
return url.split("/")[-1]
|
|
||||||
|
|
||||||
def _get_job_details(self, job_page_url: str) -> dict:
|
|
||||||
"""
|
"""
|
||||||
Retrieves job description and other job details by going to the job page url
|
Retrieves job description and other job details by going to the job page url
|
||||||
:param job_page_url:
|
:param job_page_url:
|
||||||
:return: dict
|
:return: dict
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
response = self.session.get(job_page_url, timeout=5)
|
response = self.session.get(
|
||||||
|
f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
|
||||||
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except:
|
except:
|
||||||
return {}
|
return {}
|
||||||
if response.url == "https://www.linkedin.com/signup":
|
if "linkedin.com/signup" in response.url:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
@@ -257,16 +248,22 @@ class LinkedInScraper(Scraper):
|
|||||||
)
|
)
|
||||||
description = None
|
description = None
|
||||||
if div_content is not None:
|
if div_content is not None:
|
||||||
|
|
||||||
def remove_attributes(tag):
|
|
||||||
for attr in list(tag.attrs):
|
|
||||||
del tag[attr]
|
|
||||||
return tag
|
|
||||||
|
|
||||||
div_content = remove_attributes(div_content)
|
div_content = remove_attributes(div_content)
|
||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
|
||||||
|
h3_tag = soup.find(
|
||||||
|
"h3", text=lambda text: text and "Job function" in text.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
job_function = None
|
||||||
|
if h3_tag:
|
||||||
|
job_function_span = h3_tag.find_next(
|
||||||
|
"span", class_="description__job-criteria-text"
|
||||||
|
)
|
||||||
|
if job_function_span:
|
||||||
|
job_function = job_function_span.text.strip()
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
@@ -274,6 +271,7 @@ class LinkedInScraper(Scraper):
|
|||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
||||||
"data-delayed-url"
|
"data-delayed-url"
|
||||||
),
|
),
|
||||||
|
"job_function": job_function,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
|
|||||||
else:
|
else:
|
||||||
self.proxies = {}
|
self.proxies = {}
|
||||||
response = tls_client.Session.execute_request(self, *args, **kwargs)
|
response = tls_client.Session.execute_request(self, *args, **kwargs)
|
||||||
|
response.ok = response.status_code in range(200, 400)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
@@ -178,3 +179,61 @@ def currency_parser(cur_str):
|
|||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
|
|
||||||
return np.round(num, 2)
|
return np.round(num, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_attributes(tag):
    """
    Strip every attribute from ``tag`` and hand the same object back.

    Only the attributes of ``tag`` itself are deleted; nothing here walks
    into child elements.

    :param tag: object exposing an ``attrs`` mapping and item deletion
        (callers in this package pass BeautifulSoup tags)
    :return: the same ``tag`` object, now without attributes
    """
    # Snapshot the names first: deleting while iterating the live mapping
    # would change its size mid-loop.
    attribute_names = tuple(tag.attrs)
    for name in attribute_names:
        del tag[name]
    return tag
|
||||||
|
|
||||||
|
|
||||||
|
def extract_salary(
    salary_str,
    lower_limit=1000,
    upper_limit=700000,
    hourly_threshold=350,
    monthly_threshold=30000,
):
    """
    Find the first "$min - $max" salary range in free text and annualize it.

    Recognised forms include ``$50,000 - $70,000``, ``$50k-70k``,
    ``$100-120k`` and ``$22.50 – $30.75``. Amounts are kept as floats until
    all scaling is done, so fractional figures such as ``$22.50`` an hour or
    ``$50.5k`` are not truncated before being multiplied.

    :param salary_str: text to search (e.g. a job description); None or an
        empty string yields no result
    :param lower_limit: smallest annual amount accepted for either bound
    :param upper_limit: largest annual amount accepted for either bound
    :param hourly_threshold: a minimum below this is treated as an hourly
        wage and multiplied by 2080
    :param monthly_threshold: otherwise, a minimum below this is treated as a
        monthly wage and multiplied by 12
    :return: ``("yearly", min_amount, max_amount, "USD")`` with integer
        amounts when a range is found and passes the limit checks, else
        ``(None, None, None, None)``
    """
    if not salary_str:
        return None, None, None, None

    min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)"

    def to_number(s):
        # Keep the fractional part; rounding happens once, after scaling.
        return float(s.replace(",", ""))

    def convert_hourly_to_annual(hourly_wage):
        return hourly_wage * 2080

    def convert_monthly_to_annual(monthly_wage):
        return monthly_wage * 12

    match = re.search(min_max_pattern, salary_str)
    if not match:
        return None, None, None, None

    min_salary = to_number(match.group(1))
    max_salary = to_number(match.group(3))

    # A 'k' suffix on either bound scales BOTH bounds, so the shorthand
    # "$100-120k" reads as 100,000 - 120,000.
    if match.group(2) or match.group(4):
        min_salary *= 1000
        max_salary *= 1000

    # The minimum decides how the range is interpreted; the maximum is only
    # converted when it also falls under the same threshold.
    # NOTE(review): this nesting was reconstructed from a flattened source.
    if min_salary < hourly_threshold:
        min_salary = convert_hourly_to_annual(min_salary)
        if max_salary < hourly_threshold:
            max_salary = convert_hourly_to_annual(max_salary)
    elif min_salary < monthly_threshold:
        min_salary = convert_monthly_to_annual(min_salary)
        if max_salary < monthly_threshold:
            max_salary = convert_monthly_to_annual(max_salary)

    # Callers expect whole amounts.
    min_salary = int(round(min_salary))
    max_salary = int(round(max_salary))

    # Ensure salary range is within specified limits
    if (
        lower_limit <= min_salary <= upper_limit
        and lower_limit <= max_salary <= upper_limit
        and min_salary < max_salary
    ):
        return "yearly", min_salary, max_salary, "USD"
    return None, None, None, None
|
||||||
|
|||||||
@@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import math
|
import math
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Tuple, Any
|
from typing import Optional, Tuple, Any
|
||||||
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
logger,
|
logger,
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
|
remove_attributes,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
@@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
||||||
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
||||||
comp_currency = job.get("compensation_currency")
|
comp_currency = job.get("compensation_currency")
|
||||||
|
description_full, job_url_direct = self._get_descr(job_url)
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job["listing_key"]),
|
id=str(job["listing_key"]),
|
||||||
title=title,
|
title=title,
|
||||||
@@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
),
|
),
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
description=description,
|
description=description_full if description_full else description,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
|
job_url_direct=job_url_direct,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _get_descr(self, job_url):
    """
    Fetch a job page and extract the full description and direct job URL.

    The description is the cleaned HTML of the ``div.job_description`` and
    ``section.company_description`` elements concatenated, converted to
    markdown when the scraper input asks for it. The direct URL is read from
    the ``job_url=`` part of ``model.saveJobURL`` in the page's
    ``application/json`` script tag.

    :param job_url: URL of the job page to fetch
    :return: ``(description_full, job_url_direct)``; both are None when the
        response is not ok, and ``job_url_direct`` is None whenever the
        embedded JSON is missing or unusable
    """
    res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
    description_full = job_url_direct = None
    if res.ok:
        soup = BeautifulSoup(res.text, "html.parser")
        job_descr_div = soup.find("div", class_="job_description")
        company_descr_section = soup.find("section", class_="company_description")
        job_description_clean = (
            remove_attributes(job_descr_div).prettify(formatter="html")
            if job_descr_div
            else ""
        )
        company_description_clean = (
            remove_attributes(company_descr_section).prettify(formatter="html")
            if company_descr_section
            else ""
        )
        description_full = job_description_clean + company_description_clean

        script_tag = soup.find("script", type="application/json")
        if script_tag:
            # The direct URL is an optional extra: an empty or malformed
            # script tag must not discard the description parsed above.
            try:
                job_json = json.loads(script_tag.string)
                job_url_val = job_json["model"]["saveJobURL"]
                m = re.search(r"job_url=(.+)", job_url_val)
            except (json.JSONDecodeError, KeyError, TypeError) as e:
                logger.debug(f"ZipRecruiter: could not read direct job url: {e}")
                m = None
            if m:
                job_url_direct = m.group(1)

        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            description_full = markdown_converter(description_full)

    return description_full, job_url_direct
|
||||||
|
|
||||||
def _get_cookies(self):
|
def _get_cookies(self):
|
||||||
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||||
url = f"{self.api_url}/jobs-app/event"
|
url = f"{self.api_url}/jobs-app/event"
|
||||||
|
|||||||
Reference in New Issue
Block a user