Compare commits

..

14 Commits

Author SHA1 Message Date
Cullen Watson
93223b6a38 bug fix 2023-10-30 13:57:23 -05:00
Cullen Watson
e3fc222eb5 readd proxy support for zip (#64) 2023-10-29 08:54:56 -05:00
Cullen
b303b3f841 chore: version 2023-10-28 16:58:32 -05:00
Cullen
1a0c75f323 chore: version 2023-10-28 16:54:04 -05:00
Cullen
e2f6885d61 chore: format 2023-10-28 16:52:05 -05:00
Cullen
8d65d1b652 [chore] version 2023-10-28 16:43:44 -05:00
Cullen
216d3fd39f ziprecruiter: 5s delay 2023-10-28 16:41:32 -05:00
Cullen Watson
d3bfdc0a6e ziprecruiter api (#63) 2023-10-28 16:17:28 -05:00
Cullen Watson
ba5ed803ca use ziprecuriter api (#62) 2023-10-28 15:51:29 -05:00
Cullen Watson
ff1eb0f7b0 [docs] update readme 2023-10-18 14:32:21 -05:00
Cullen Watson
f2cc74b7f2 Fix Indeed exceptions on parsing description 2023-10-18 14:25:53 -05:00
Cullen Watson
5e71866630 [docs] link change 2023-10-18 11:18:03 -05:00
Zachary Hampton
4e67c6e5a3 Update README.md 2023-10-17 20:22:56 -07:00
Cullen Watson
caf655525a docs: update readme 2023-10-10 11:54:14 -05:00
10 changed files with 1079 additions and 1306 deletions

View File

@@ -4,10 +4,10 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to
work with us.*
\
Check out another project we wrote: ***[HomeHarvest](https://github.com/ZacharyHampton/HomeHarvest)** a Python package
Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
for real estate scraping*
## Features
@@ -24,7 +24,7 @@ Updated for release v1.1.3
### Installation
```
pip install --upgrade python-jobspy
pip install python-jobspy
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -43,11 +43,7 @@ jobs = scrape_jobs(
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", index=False)
# output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)
jobs.to_csv("jobs.csv", index=False)  # or jobs.to_excel("jobs.xlsx", index=False)
```
### Output
@@ -92,16 +88,16 @@ JobPost
│ ├── city (str)
│ ├── state (str)
├── description (str)
├── job_type (enum): fulltime, parttime, internship, contract
├── job_type (str): fulltime, parttime, internship, contract
├── compensation (object)
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int)
│ ├── max_amount (int)
│ └── currency (enum)
└── date_posted (date)
└── emails (str)
└── num_urgent_words (int)
└── is_remote (bool) - just for Indeed at the moment
└── is_remote (bool)
```
### Exceptions
@@ -154,13 +150,12 @@ You can specify the following countries when searching on Indeed (use the exact
**Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/cullenwatson/JobSpy/issues).
persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
---
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently,
**LinkedIn** is particularly aggressive with blocking. We recommend:
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
- Waiting a few seconds between requests.
- Trying a VPN or proxy to change your IP address.

1793
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1,9 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.13"
version = "1.1.23"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md"
packages = [
@@ -16,6 +16,7 @@ requests = "^2.31.0"
tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"

View File

@@ -84,13 +84,12 @@ def scrape_jobs(
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException()
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
raise ZipRecruiterException(str(e))
else:
raise e
return site.value, scraped_data

View File

@@ -37,10 +37,16 @@ class JobType(Enum):
"повназайнятість",
"toànthờigian",
)
PART_TIME = ("parttime", "teilzeit")
PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
INTERNSHIP = (
"internship",
"prácticas",
"ojt(onthejobtraining)",
"praktikum",
"praktik",
)
PER_DIEM = ("perdiem",)
NIGHTS = ("nights",)
@@ -171,8 +177,8 @@ class CompensationInterval(Enum):
class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None
min_amount: int = None
max_amount: int = None
min_amount: int | None = None
max_amount: int | None = None
currency: Optional[str] = "USD"

View File

@@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
class LinkedInException(Exception):
    """Raised when scraping LinkedIn fails."""

    def __init__(self, message=None):
        # A missing or empty message falls back to the generic LinkedIn text.
        if not message:
            message = "An error occurred with LinkedIn"
        super().__init__(message)
class IndeedException(Exception):
    """Raised when scraping Indeed fails."""

    def __init__(self, message=None):
        # A missing or empty message falls back to the generic Indeed text.
        if not message:
            message = "An error occurred with Indeed"
        super().__init__(message)
class ZipRecruiterException(Exception):
    """Raised when scraping ZipRecruiter fails."""

    def __init__(self, message=None):
        # A missing or empty message falls back to the generic ZipRecruiter text.
        if not message:
            message = "An error occurred with ZipRecruiter"
        super().__init__(message)

View File

@@ -16,7 +16,12 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import (
count_urgent_words,
extract_emails_from_text,
create_session,
get_enum_from_job_type,
)
from ...jobs import (
JobPost,
Compensation,
@@ -53,7 +58,6 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country
domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
params = {
"q": scraper_input.search_term,
@@ -73,6 +77,7 @@ class IndeedScraper(Scraper):
if sc_values:
params["sc"] = "0kf:" + "".join(sc_values) + ";"
try:
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"{self.url}/jobs",
headers=self.get_headers(),
@@ -162,10 +167,10 @@ class IndeedScraper(Scraper):
)
return job_post
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [
executor.submit(process_job, job)
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
executor.submit(process_job, job) for job in jobs
]
job_list = [result.result() for result in job_results if result.result()]
@@ -230,13 +235,37 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400):
return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]
with io.StringIO(raw_description) as f:
soup = BeautifulSoup(f, "html.parser")
text_content = " ".join(soup.get_text().split()).strip()
return text_content
soup = BeautifulSoup(response.text, "html.parser")
script_tag = soup.find(
"script", text=lambda x: x and "window._initialData" in x
)
if not script_tag:
return None
script_code = script_tag.string
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(
job_description, "html.parser"
)
text_content = " ".join(
soup.get_text(separator=" ").split()
).strip()
return text_content
@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:
@@ -252,22 +281,11 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][i].get("label")
if label:
job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append(
IndeedScraper.get_enum_from_job_type(job_type_str)
)
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
return job_types
@staticmethod
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None
@staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict:
"""

View File

@@ -9,7 +9,6 @@ from datetime import datetime
import requests
import time
import re
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
@@ -17,7 +16,7 @@ from bs4.element import Tag
from threading import Lock
from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..exceptions import LinkedInException
from ...jobs import (
JobPost,
@@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return LinkedInScraper.get_enum_from_value(employment_type)
return [get_enum_from_job_type(employment_type)]
return description, get_job_type(soup)
@staticmethod
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return [job_type]
return None
def get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.

View File

@@ -1,5 +1,8 @@
import re
import requests
import tls_client
from ..jobs import JobType
def count_urgent_words(description: str) -> int:
@@ -23,22 +26,39 @@ def extract_emails_from_text(text: str) -> list[str] | None:
return email_regex.findall(text)
def create_session(proxy: str | None = None):
def create_session(proxy: dict | None = None, is_tls: bool = True):
"""
Creates a tls client session
:return: A session object with or without proxies.
"""
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
if is_tls:
session = tls_client.Session(
client_identifier="chrome112",
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
if proxy:
session.proxies.update(proxy)
return session
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Given a string, returns the corresponding JobType enum member if a match is found.

    Members are checked in definition order with ``job_type_str in member.value``,
    so the argument has to match one of the member's aliases as stored.

    :param job_type_str: job-type label to look up
    :return: the first JobType member whose value contains the label, else None
    """
    for job_type in JobType:
        if job_type_str in job_type.value:
            # Return on the first hit: no need to scan the remaining members,
            # and an alias listed under two members resolves to the earlier one.
            return job_type
    return None

View File

@@ -5,29 +5,18 @@ jobspy.scrapers.ziprecruiter
This module contains routines to scrape ZipRecruiter.
"""
import math
import json
import time
import re
from datetime import datetime, date
from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from concurrent.futures import ThreadPoolExecutor
from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import (
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
class ZipRecruiterScraper(Scraper):
@@ -42,23 +31,23 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20
self.seen_urls = set()
def find_jobs_in_page(
self, scraper_input: ScraperInput, page: int
) -> list[JobPost]:
def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
"""
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:param page:
:param continue_token:
:return: jobs found on page
"""
session = create_session(self.proxy)
params = self.add_params(scraper_input)
if continue_token:
params['continue'] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
response = session.get(
f"{self.url}/jobs-search",
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input, page),
allow_redirects=True,
timeout_seconds=10,
params=self.add_params(scraper_input),
timeout=10,
)
if response.status_code != 200:
raise ZipRecruiterException(
@@ -68,118 +57,66 @@ class ZipRecruiterScraper(Scraper):
if "Proxy responded with non 200 code" in str(e):
raise ZipRecruiterException("bad proxy")
raise ZipRecruiterException(str(e))
else:
soup = BeautifulSoup(response.text, "html.parser")
js_tag = soup.find("script", {"id": "js_variables"})
if js_tag:
page_json = json.loads(js_tag.string)
jobs_list = page_json.get("jobList")
if jobs_list:
page_variant = "javascript"
# print('type javascript', len(jobs_list))
else:
page_variant = "html_2"
jobs_list = soup.find_all("div", {"class": "job_content"})
# print('type 2 html', len(jobs_list))
else:
page_variant = "html_1"
jobs_list = soup.find_all("li", {"class": "job-listing"})
# print('type 1 html', len(jobs_list))
time.sleep(5)
response_data = response.json()
jobs_list = response_data.get("jobs", [])
next_continue_token = response_data.get('continue', None)
with ThreadPoolExecutor(max_workers=10) as executor:
if page_variant == "javascript":
job_results = [
executor.submit(self.process_job_javascript, job)
for job in jobs_list
]
elif page_variant == "html_1":
job_results = [
executor.submit(self.process_job_html_1, job) for job in jobs_list
]
elif page_variant == "html_2":
job_results = [
executor.submit(self.process_job_html_2, job) for job in jobs_list
]
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
job_results = [
executor.submit(self.process_job, job)
for job in jobs_list
]
job_list = [result.result() for result in job_results if result.result()]
return job_list
return job_list, next_continue_token
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes ZipRecruiter for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
Scrapes ZipRecruiter for jobs with scraper_input criteria.
:param scraper_input: Information about job search criteria.
:return: JobResponse containing a list of jobs.
"""
start_page = (
(scraper_input.offset // self.jobs_per_page) + 1
if scraper_input.offset
else 1
)
#: get first page to initialize session
job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page)
pages_to_process = max(
3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
)
job_list: list[JobPost] = []
continue_token = None
with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [
executor.submit(self.find_jobs_in_page, scraper_input, page)
for page in range(start_page + 1, start_page + pages_to_process + 2)
]
max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
for future in futures:
jobs = future.result()
for page in range(1, max_pages + 1):
if len(job_list) >= scraper_input.results_wanted:
break
job_list += jobs
jobs_on_page, continue_token = self.find_jobs_in_page(scraper_input, continue_token)
if jobs_on_page:
job_list.extend(jobs_on_page)
if not continue_token:
break
if len(job_list) > scraper_input.results_wanted:
job_list = job_list[:scraper_input.results_wanted]
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def process_job_javascript(self, job: dict) -> JobPost:
"""the most common type of jobs page on ZR"""
title = job.get("Title")
job_url = job.get("JobURL")
@staticmethod
def process_job(job: dict) -> JobPost:
""" Processes an individual job dict from the response """
title = job.get("name")
job_url = job.get("job_url")
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = BeautifulSoup(
job.get("Snippet", "").strip(), "html.parser"
).get_text()
description = BeautifulSoup(
job.get("job_description", "").strip(), "html.parser"
).get_text()
company = job.get("OrgName")
company = job['hiring_company'].get("name") if "hiring_company" in job else None
location = Location(
city=job.get("City"), state=job.get("State"), country=Country.US_CANADA
city=job.get("job_city"), state=job.get("job_state"), country='usa' if job.get("job_country") == 'US' else 'canada'
)
job_type = ZipRecruiterScraper.get_job_type_enum(
job.get("EmploymentType", "").replace("-", "").lower()
job.get("employment_type", "").replace("_", "").lower()
)
formatted_salary = job.get("FormattedSalaryShort", "")
salary_parts = formatted_salary.split(" ")
min_salary_str = salary_parts[0][1:].replace(",", "")
if "." in min_salary_str:
min_amount = int(float(min_salary_str) * 1000)
else:
min_amount = int(min_salary_str.replace("K", "000"))
if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
max_salary_str = salary_parts[2][1:].replace(",", "")
if "." in max_salary_str:
max_amount = int(float(max_salary_str) * 1000)
else:
max_amount = int(max_salary_str.replace("K", "000"))
else:
max_amount = 0
compensation = Compensation(
interval=CompensationInterval.YEARLY,
min_amount=min_amount,
max_amount=max_amount,
currency="USD/CAD",
)
save_job_url = job.get("SaveJobURL", "")
posted_time_match = re.search(
r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
@@ -196,7 +133,12 @@ class ZipRecruiterScraper(Scraper):
company_name=company,
location=location,
job_type=job_type,
compensation=compensation,
compensation=Compensation(
interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
currency=job.get("compensation_currency"),
),
date_posted=date_posted,
job_url=job_url,
description=description,
@@ -204,95 +146,6 @@ class ZipRecruiterScraper(Scraper):
num_urgent_words=count_urgent_words(description) if description else None,
)
def process_job_html_2(self, job: Tag) -> Optional[JobPost]:
"""
second most common type of jobs page on ZR after process_job_javascript()
Parses a job from the job content tag for a second variat of HTML that ZR uses
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", class_="job_link")["href"]
title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", class_="job_snippet").get_text().strip()
job_type_text = job.find("li", class_="perk_item perk_type")
job_type = None
if job_type_text:
job_type_text = (
job_type_text.get_text()
.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
return job_post
def process_job_html_1(self, job: Tag) -> Optional[JobPost]:
"""
TODO this method isnt finished due to not encountering this type of html often
least common type of jobs page on ZR (rarely found)
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", {"class": "job_link"})["href"]
# job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, _ = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
# get description from jobs listing page if get_description from the specific job page fails
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
job_type = None
if job_type_element:
job_type_text = (
job_type_element.text.strip().lower().replace("_", "").replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description),
)
return job_post
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
@@ -300,39 +153,11 @@ class ZipRecruiterScraper(Scraper):
return [job_type]
return None
def get_description(self, job_page_url: str) -> Tuple[str | None, str | None]:
"""
Retrieves job description by going to the job page url
:param job_page_url:
:return: description or None, response url
"""
try:
session = create_session(self.proxy)
response = session.get(
job_page_url,
headers=self.headers(),
allow_redirects=True,
timeout_seconds=5,
)
if response.status_code not in range(200, 400):
return None, None
except Exception as e:
return None, None
html_string = response.content
soup_job = BeautifulSoup(html_string, "html.parser")
job_description_div = soup_job.find("div", {"class": "job_description"})
if job_description_div:
return job_description_div.text.strip(), response.url
return None, response.url
@staticmethod
def add_params(scraper_input, page) -> dict[str, str | Any]:
def add_params(scraper_input) -> dict[str, str | Any]:
params = {
"search": scraper_input.search_term,
"location": scraper_input.location,
"page": page,
"form": "jobs-landing",
}
job_type_value = None
@@ -357,107 +182,6 @@ class ZipRecruiterScraper(Scraper):
return params
@staticmethod
def get_interval(interval_str: str):
"""
Maps the interval alias to its appropriate CompensationInterval.
:param interval_str
:return: CompensationInterval
"""
interval_alias = {"annually": CompensationInterval.YEARLY}
interval_str = interval_str.lower()
if interval_str in interval_alias:
return interval_alias[interval_str]
return CompensationInterval(interval_str)
@staticmethod
def get_date_posted(job: Tag) -> Optional[datetime.date]:
"""
Extracts the date a job was posted
:param job
:return: date the job was posted or None
"""
button = job.find(
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
)
if not button:
return None
url_time = button.get("data-href", "")
url_components = urlparse(url_time)
params = parse_qs(url_components.query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
posted_date = datetime.strptime(
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
).date()
return posted_date
return None
@staticmethod
def get_compensation(job: Tag) -> Optional[Compensation]:
"""
Parses the compensation tag from the job BeautifulSoup object
:param job
:return: Compensation object or None
"""
pay_element = job.find("li", {"class": "perk_item perk_pay"})
if pay_element is None:
return None
pay = pay_element.find("div", {"class": "value"}).find("span").text.strip()
def create_compensation_object(pay_string: str) -> Compensation:
"""
Creates a Compensation object from a pay_string
:param pay_string
:return: compensation
"""
interval = ZipRecruiterScraper.get_interval(pay_string.split()[-1])
amounts = []
for amount in pay_string.split("to"):
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
if "K" in amount:
amount = amount.replace("K", "")
amount = int(float(amount)) * 1000
else:
amount = int(float(amount))
amounts.append(amount)
compensation = Compensation(
interval=interval,
min_amount=min(amounts),
max_amount=max(amounts),
currency="USD/CAD",
)
return compensation
return create_compensation_object(pay)
@staticmethod
def get_location(job: Tag) -> Location:
"""
Extracts the job location from BeatifulSoup object
:param job:
:return: location
"""
location_link = job.find("a", {"class": "company_location"})
if location_link is not None:
location_string = location_link.text.strip()
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
else:
city, state = None, None
else:
city, state = None, None
return Location(city=city, state=state, country=Country.US_CANADA)
@staticmethod
def headers() -> dict:
"""
@@ -465,11 +189,13 @@ class ZipRecruiterScraper(Scraper):
:return: dict - Dictionary containing headers
"""
return {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
'Host': 'api.ziprecruiter.com',
'Cookie': 'ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38',
'accept': '*/*',
'x-zr-zva-override': '100000000;vid:ZT1huzm_EQlDTVEc',
'x-pushnotificationid': '0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0',
'x-deviceid': 'D77B3A92-E589-46A4-8A39-6EF6F1D86006',
'user-agent': 'Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)',
'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
'accept-language': 'en-US,en;q=0.9'
}
# @staticmethod
# def cleanurl(url) -> str:
# parsed_url = urlparse(url)
#
# return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', ''))