Library Migration (#31)

This commit is contained in:
Zachary Hampton
2023-09-03 07:29:25 -07:00
committed by GitHub
parent 7efece8fe9
commit 153ac35248
36 changed files with 3604 additions and 1473 deletions

View File

View File

@@ -0,0 +1,75 @@
from typing import Union, Optional
from datetime import date
from enum import Enum
from pydantic import BaseModel, validator
class JobType(Enum):
    """Employment categories that can be attached to a job posting.

    Every value is a lowercase token with no separators.
    """

    # Standard employment arrangements
    FULL_TIME = "fulltime"
    PART_TIME = "parttime"
    CONTRACT = "contract"
    TEMPORARY = "temporary"
    INTERNSHIP = "internship"

    # Less common arrangements
    PER_DIEM = "perdiem"
    NIGHTS = "nights"
    OTHER = "other"
    SUMMER = "summer"
    VOLUNTEER = "volunteer"
class Location(BaseModel):
    """Geographic location of a job posting.

    ``city`` and ``state`` are optional: the scrapers pass ``None`` for
    either when the listing does not provide it.
    """

    country: str = "USA"
    # Declared Optional so an explicit ``None`` is a valid value, matching
    # the ``None`` default and the sibling ``state`` field.
    city: Optional[str] = None
    state: Optional[str] = None
class CompensationInterval(Enum):
    """Pay period that a compensation amount refers to."""

    # Ordered from longest to shortest period.
    YEARLY = "yearly"
    MONTHLY = "monthly"
    WEEKLY = "weekly"
    DAILY = "daily"
    HOURLY = "hourly"
class Compensation(BaseModel):
    """Pay information for a job posting.

    ``interval`` is required; the amounts may be absent when a listing
    does not state them.
    """

    interval: CompensationInterval
    # Optional so that the ``None`` default is consistent with the
    # declared type and an explicit ``None`` validates.
    min_amount: Optional[int] = None
    max_amount: Optional[int] = None
    currency: str = "USD"
class JobPost(BaseModel):
    """A single job listing as returned by any of the scrapers."""

    title: str
    company_name: str
    job_url: str

    # Every field below may legitimately be missing from a listing, so each
    # is declared Optional with an explicit ``None`` default.
    location: Optional[Location] = None
    description: Optional[str] = None
    job_type: Optional[JobType] = None
    compensation: Optional[Compensation] = None
    date_posted: Optional[date] = None
class JobResponse(BaseModel):
    """Result of one scraper run.

    ``returned_results`` is filled in automatically from ``jobs`` when the
    caller does not supply it.
    """

    success: bool
    error: Optional[str] = None

    total_results: Optional[int] = None

    jobs: list[JobPost] = []

    # Declared after ``jobs`` so the validator below can read the
    # already-validated job list from ``values``.
    returned_results: Optional[int] = None

    @validator("returned_results", pre=True, always=True)
    def set_returned_results(cls, v, values):
        """
        Defaults returned_results to the number of jobs in the response
        :param v: value supplied by the caller, or None
        :param values: fields validated so far
        :return: v if given, otherwise len(jobs) (0 when jobs is unavailable)
        """
        if v is not None:
            return v
        jobs_list = values.get("jobs")
        return len(jobs_list) if jobs_list is not None else 0

View File

@@ -0,0 +1,43 @@
from ..jobs import Enum, BaseModel, JobType, JobResponse
from typing import List, Dict, Optional, Any
class StatusException(Exception):
    """Raised when a request returns a status code the scraper cannot use.

    The offending code is available as ``status_code``.
    """

    def __init__(self, status_code: int):
        """
        :param status_code: status code of the failed response
        """
        # Pass a message to the base class so str(exc), logs and tracebacks
        # are not empty.
        super().__init__(f"Unexpected status code: {status_code}")
        self.status_code = status_code
class Site(Enum):
    """Job boards this package has a scraper for."""

    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
class ScraperInput(BaseModel):
    """Search criteria passed to every scraper."""

    site_type: List[Site]
    search_term: str

    # Optional filters: ``None`` means "do not filter on this".
    location: Optional[str] = None
    distance: Optional[int] = None
    is_remote: bool = False
    job_type: Optional[JobType] = None
    easy_apply: Optional[bool] = None  # linkedin

    results_wanted: int = 15
class CommonResponse(BaseModel):
    """Aggregate response with one optional slot per supported site."""

    # Explicit ``None`` defaults keep these consistent with the per-site
    # fields below and make "not required" independent of how the model
    # library treats a bare Optional annotation.
    status: Optional[str] = None
    error: Optional[str] = None
    linkedin: Optional[Any] = None
    indeed: Optional[Any] = None
    zip_recruiter: Optional[Any] = None
class Scraper:
    """Common base for the site-specific scrapers."""

    def __init__(self, site: Site, url: str):
        """
        Stores the site identifier and base url
        :param site:
        :param url:
        """
        self.url = url
        self.site = site

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Placeholder that does nothing here; subclasses provide the real
        implementation
        :param scraper_input:
        """

View File

@@ -0,0 +1,301 @@
import re
import sys
import math
import json
from datetime import datetime
from typing import Optional, Tuple, List
import tls_client
import urllib.parse
from bs4 import BeautifulSoup
from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
from .. import Scraper, ScraperInput, Site, StatusException
class ParsingException(Exception):
    """Raised when an Indeed results page cannot be parsed."""
class IndeedScraper(Scraper):
    """Scraper for Indeed search results."""

    def __init__(self):
        """
        Initializes IndeedScraper with the Indeed job search url
        """
        super().__init__(Site.INDEED, "https://www.indeed.com")

        self.jobs_per_page = 15
        # URLs already turned into JobPosts; used to drop duplicates.
        self.seen_urls = set()

    def scrape_page(
        self, scraper_input: ScraperInput, page: int, session: tls_client.Session
    ) -> tuple[list[JobPost], int]:
        """
        Scrapes a page of Indeed for jobs with scraper_input criteria
        :param scraper_input:
        :param page: zero-based page index
        :param session:
        :return: jobs found on page, total number of jobs found for search
        :raises StatusException: response status is neither 200 nor 307
        :raises ParsingException: no matches, or the job data could not be located
        """
        params = {
            "q": scraper_input.search_term,
            "l": scraper_input.location,
            "radius": scraper_input.distance,
            "filter": 0,
            "start": page * 10,
        }

        sc_values = []
        if scraper_input.is_remote:
            sc_values.append("attr(DSQF7)")
        if scraper_input.job_type:
            sc_values.append("jt({})".format(scraper_input.job_type.value))
        if sc_values:
            params["sc"] = "0kf:" + "".join(sc_values) + ";"

        response = session.get(self.url + "/jobs", params=params)
        if response.status_code not in (200, 307):
            raise StatusException(response.status_code)

        soup = BeautifulSoup(response.content, "html.parser")
        if "did not match any jobs" in str(soup):
            raise ParsingException("Search did not match any jobs")

        #: can raise exception, handled by main scrape function
        jobs = IndeedScraper.parse_jobs(soup)
        total_num_jobs = IndeedScraper.total_jobs(soup)

        results = (
            jobs.get("metaData", {})
            .get("mosaicProviderJobCardsModel", {})
            .get("results")
        )
        if not results:
            raise Exception("No jobs found.")

        def process_job(job) -> Optional[JobPost]:
            """Builds a JobPost from one job card, or None for a duplicate."""
            job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
            job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
            if job_url in self.seen_urls:
                return None
            # Record the URL so later cards and pages skip it. The check and
            # add are not synchronised, so two workers racing on the same URL
            # can occasionally both get through.
            self.seen_urls.add(job_url)

            compensation = IndeedScraper._build_compensation(job)
            job_type = IndeedScraper.get_job_type(job)

            # pubDate is divided by 1000 before conversion (ms -> s).
            timestamp_seconds = job["pubDate"] / 1000
            date_posted = datetime.fromtimestamp(timestamp_seconds).strftime(
                "%Y-%m-%d"
            )

            description = self.get_description(job_url, session)
            if description is None:
                # Fall back to the bullet points of the search-result snippet.
                snippet_html = BeautifulSoup(job.get("snippet") or "", "html.parser")
                li_elements = snippet_html.find_all("li")
                if li_elements:
                    description = " ".join(li.text for li in li_elements)

            return JobPost(
                title=job["normTitle"],
                description=description,
                company_name=job["company"],
                location=Location(
                    city=job.get("jobLocationCity"),
                    state=job.get("jobLocationState"),
                ),
                job_type=job_type,
                compensation=compensation,
                date_posted=date_posted,
                job_url=job_url_client,
            )

        with ThreadPoolExecutor(max_workers=10) as executor:
            job_results: list[Future] = [
                executor.submit(process_job, job) for job in results
            ]

        job_list = [
            post for post in (future.result() for future in job_results) if post
        ]
        return job_list, total_num_jobs

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Indeed for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        session = tls_client.Session(
            client_identifier="chrome112", random_tls_extension_order=True
        )

        # Number of pages to fetch in addition to the first one.
        pages_to_process = (
            math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
        )

        try:
            #: get first page to initialize session
            job_list, total_results = self.scrape_page(scraper_input, 0, session)

            with ThreadPoolExecutor(max_workers=10) as executor:
                futures: list[Future] = [
                    executor.submit(self.scrape_page, scraper_input, page, session)
                    for page in range(1, pages_to_process + 1)
                ]

                for future in futures:
                    jobs, _ = future.result()
                    job_list += jobs

        except StatusException as e:
            return JobResponse(
                success=False,
                error=f"Indeed returned status code {e.status_code}",
            )
        except ParsingException as e:
            return JobResponse(
                success=False,
                error=f"Indeed failed to parse response: {e}",
            )
        except Exception as e:
            return JobResponse(
                success=False,
                error=f"Indeed failed to scrape: {e}",
            )

        if len(job_list) > scraper_input.results_wanted:
            job_list = job_list[: scraper_input.results_wanted]

        return JobResponse(
            success=True,
            jobs=job_list,
            total_results=total_results,
        )

    def get_description(
        self, job_page_url: str, session: tls_client.Session
    ) -> Optional[str]:
        """
        Retrieves job description by going to the job page url
        :param job_page_url:
        :param session:
        :return: description, or None when it cannot be retrieved
        """
        parsed_url = urllib.parse.urlparse(job_page_url)
        params = urllib.parse.parse_qs(parsed_url.query)
        jk_value = params.get("jk", [None])[0]
        formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"

        response = session.get(formatted_url, allow_redirects=True)
        if response.status_code not in range(200, 400):
            return None

        # The description is best-effort (callers fall back to the snippet),
        # so an unexpected body must not abort the whole page.
        try:
            raw_description = response.json()["body"]["jobInfoWrapperModel"][
                "jobInfoModel"
            ]["sanitizedJobDescription"]
        except (ValueError, KeyError, TypeError):
            return None
        if not raw_description:
            return None

        soup = BeautifulSoup(raw_description, "html.parser")
        # Collapse every run of whitespace into a single space.
        return " ".join(soup.get_text().split())

    @staticmethod
    def _build_compensation(job: dict) -> Optional[Compensation]:
        """
        Builds a Compensation from the job's extractedSalary, if usable
        :param job:
        :return: compensation, or None when salary or a known interval is missing
        """
        extracted_salary = job.get("extractedSalary")
        if not extracted_salary:
            return None

        interval = extracted_salary.get("type")
        if not isinstance(interval, str):
            return None
        interval = interval.upper()
        if interval not in CompensationInterval.__members__:
            return None

        fields = {"interval": CompensationInterval[interval]}
        # "min" feeds min_amount and "max" feeds max_amount.
        for bound in ("min", "max"):
            amount = extracted_salary.get(bound)
            if amount is not None:
                fields[f"{bound}_amount"] = int(amount)

        # Only pass a currency when one is present so the model default
        # applies otherwise, instead of an explicit None.
        salary_snippet = job.get("salarySnippet")
        currency = salary_snippet.get("currency") if salary_snippet else None
        if currency:
            fields["currency"] = currency

        return Compensation(**fields)

    @staticmethod
    def get_job_type(job: dict) -> Optional[JobType]:
        """
        Parses the job to get JobTypeIndeed
        :param job:
        :return: matching JobType, or None when absent or not recognised
        """
        for taxonomy in job.get("taxonomyAttributes") or []:
            attributes = taxonomy.get("attributes")
            if taxonomy.get("label") == "job-types" and attributes:
                job_type_str = (
                    attributes[0]["label"]
                    .replace("-", "_")
                    .replace(" ", "_")
                    .upper()
                )
                # Unknown labels map to None and do not raise KeyError.
                return JobType.__members__.get(job_type_str)
        return None

    @staticmethod
    def parse_jobs(soup: BeautifulSoup) -> dict:
        """
        Parses the jobs from the soup object
        :param soup:
        :return: jobs
        :raises ParsingException: script tag or its job-card data not found
        """

        def find_mosaic_script() -> Optional[Tag]:
            """
            Finds jobcards script tag
            :return: script_tag
            """
            for tag in soup.find_all("script"):
                if (
                    tag.string
                    and "mosaic.providerData" in tag.string
                    and "mosaic-provider-jobcards" in tag.string
                ):
                    return tag
            return None

        script_tag = find_mosaic_script()
        if not script_tag:
            raise ParsingException(
                "Could not find a script tag containing mosaic provider data"
            )

        pattern = r'window.mosaic.providerData\["mosaic-provider-jobcards"\]\s*=\s*({.*?});'
        m = re.search(pattern, script_tag.string, re.DOTALL)
        if not m:
            raise ParsingException("Could not find mosaic provider job cards data")

        return json.loads(m.group(1).strip())

    @staticmethod
    def total_jobs(soup: BeautifulSoup) -> int:
        """
        Parses the total jobs for that search from soup object
        :param soup:
        :return: total_num_jobs, 0 when the data is not present on the page
        """
        # The filter may be called with None; guard before the ``in`` test.
        script = soup.find(
            "script", string=lambda t: bool(t) and "window._initialData" in t
        )
        if script is None or not script.string:
            return 0

        match = re.search(
            r"window._initialData\s*=\s*({.*})\s*;", script.string, re.DOTALL
        )
        if not match:
            return 0

        data = json.loads(match.group(1))
        return int(data["searchTitleBarModel"]["totalNumResults"])

View File

@@ -0,0 +1,213 @@
from typing import Optional, Tuple
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from .. import Scraper, ScraperInput, Site
from ...jobs import JobPost, Location, JobResponse, JobType, Compensation, CompensationInterval
class LinkedInScraper(Scraper):
    """Scraper for LinkedIn job search results."""

    def __init__(self):
        """
        Initializes LinkedInScraper with the LinkedIn job search url
        """
        super().__init__(Site.LINKEDIN, "https://www.linkedin.com")

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes LinkedIn for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        job_list: list[JobPost] = []
        seen_urls = set()
        page, processed_jobs = 0, 0
        # Total reported on the first page; None when it cannot be read.
        job_count: Optional[int] = None

        def job_type_code(job_type):
            mapping = {
                JobType.FULL_TIME: "F",
                JobType.PART_TIME: "P",
                JobType.INTERNSHIP: "I",
                JobType.CONTRACT: "C",
                JobType.TEMPORARY: "T",
            }
            return mapping.get(job_type, "")

        def finished() -> bool:
            """True once enough jobs are collected or every reported job was seen."""
            if len(job_list) >= scraper_input.results_wanted:
                return True
            return job_count is not None and processed_jobs >= job_count

        card_class = "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card"

        with requests.Session() as session:
            while len(job_list) < scraper_input.results_wanted:
                params = {
                    "keywords": scraper_input.search_term,
                    "location": scraper_input.location,
                    "distance": scraper_input.distance,
                    "f_WT": 2 if scraper_input.is_remote else None,
                    "f_JT": job_type_code(scraper_input.job_type)
                    if scraper_input.job_type
                    else None,
                    "pageNum": page,
                    "f_AL": "true" if scraper_input.easy_apply else None,
                }
                # Drop filters that were not requested.
                params = {k: v for k, v in params.items() if v is not None}

                response = session.get(
                    f"{self.url}/jobs/search", params=params, allow_redirects=True
                )
                if response.status_code != 200:
                    return JobResponse(
                        success=False,
                        error=f"Response returned {response.status_code}",
                    )

                soup = BeautifulSoup(response.text, "html.parser")

                if page == 0:
                    count_tag = soup.find(
                        "span", class_="results-context-header__job-count"
                    )
                    digits = (
                        "".join(filter(str.isdigit, count_tag.text))
                        if count_tag
                        else ""
                    )
                    job_count = int(digits) if digits else None

                job_cards = soup.find_all("div", class_=card_class)
                if not job_cards:
                    # A page without cards can never satisfy the loop
                    # condition; stop here to avoid requesting pages forever.
                    break

                for job_card in job_cards:
                    processed_jobs += 1
                    data_entity_urn = job_card.get("data-entity-urn", "")
                    job_id = (
                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
                    )
                    job_url = f"{self.url}/jobs/view/{job_id}"
                    if job_url in seen_urls:
                        continue
                    seen_urls.add(job_url)

                    job_info = job_card.find("div", class_="base-search-card__info")
                    if job_info is None:
                        continue

                    title_tag = job_info.find("h3", class_="base-search-card__title")
                    title = title_tag.text.strip() if title_tag else "N/A"

                    company_tag = job_info.find("a", class_="hidden-nested-link")
                    company = company_tag.text.strip() if company_tag else "N/A"

                    metadata_card = job_info.find(
                        "div", class_="base-search-card__metadata"
                    )
                    location: Location = LinkedInScraper.get_location(metadata_card)

                    # The metadata card can be missing; do not call .find on None.
                    datetime_tag = (
                        metadata_card.find(
                            "time", class_="job-search-card__listdate"
                        )
                        if metadata_card is not None
                        else None
                    )
                    description, job_type = LinkedInScraper.get_description(job_url)
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(
                            datetime_str, "%Y-%m-%d"
                        ).date()
                    else:
                        date_posted = None

                    job_post = JobPost(
                        title=title,
                        description=description,
                        company_name=company,
                        location=location,
                        date_posted=date_posted,
                        job_url=job_url,
                        job_type=job_type,
                        compensation=Compensation(
                            interval=CompensationInterval.YEARLY, currency="USD"
                        ),
                    )
                    job_list.append(job_post)
                    if finished():
                        break

                if finished():
                    break

                page += 1

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(
            success=True,
            jobs=job_list,
            total_results=job_count,
        )

    @staticmethod
    def get_description(
        job_page_url: str,
    ) -> Tuple[Optional[str], Optional[JobType]]:
        """
        Retrieves job description and job type by going to the job page url
        :param job_page_url:
        :return: (description or None, job type or None)
        """
        response = requests.get(job_page_url, allow_redirects=True)
        if response.status_code not in range(200, 400):
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )

        text_content = None
        if div_content:
            # Collapse every run of whitespace into a single space.
            text_content = " ".join(div_content.get_text().split())

        return text_content, LinkedInScraper._parse_job_type(soup)

    @staticmethod
    def _parse_job_type(soup: BeautifulSoup) -> Optional[JobType]:
        """
        Gets the job type from job page
        :param soup:
        :return: JobType, or None when absent or not recognised
        """
        h3_tag = soup.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The filter may be called with None; guard before the ``in`` test.
            string=lambda text: bool(text) and "Employment type" in text,
        )
        if not h3_tag:
            return None

        employment_type_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if not employment_type_span:
            return None

        employment_type = (
            employment_type_span.get_text(strip=True).lower().replace("-", "")
        )
        try:
            return JobType(employment_type)
        except ValueError:
            # A label with no JobType counterpart should not abort the scrape.
            return None

    @staticmethod
    def get_location(metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card
        :return: location; city and state are None unless the text is "city, state"
        """
        city, state = None, None
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            location_string = location_tag.text.strip() if location_tag else "N/A"
            parts = location_string.split(", ")
            if len(parts) == 2:
                city, state = parts

        # Always return a Location so no code path leaves the result unbound.
        return Location(
            city=city,
            state=state,
        )

View File

@@ -0,0 +1,405 @@
import json
import math
import re
from concurrent.futures import ThreadPoolExecutor, Future
from datetime import date, datetime
from typing import Optional, Tuple, List
from urllib.parse import urlparse, parse_qs

import tls_client
from bs4 import BeautifulSoup
from bs4.element import Tag

from .. import Scraper, ScraperInput, Site, StatusException
from ...jobs import JobPost, Compensation, CompensationInterval, Location, JobResponse, JobType
class ZipRecruiterScraper(Scraper):
    """Scraper for ZipRecruiter job search results."""

    def __init__(self):
        """
        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
        """
        super().__init__(Site.ZIP_RECRUITER, "https://www.ziprecruiter.com")

        self.jobs_per_page = 20
        # URLs already turned into JobPosts; used to drop duplicates.
        self.seen_urls = set()
        self.session = tls_client.Session(
            client_identifier="chrome112", random_tls_extension_order=True
        )

    def scrape_page(
        self, scraper_input: ScraperInput, page: int
    ) -> tuple[list[JobPost], Optional[int]]:
        """
        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
        :param scraper_input:
        :param page: one-based page number
        :return: jobs found on page, total number of jobs found for search
            (only reported for page 1, otherwise None)
        :raises StatusException: response status is not 200
        """
        job_type_value = None
        if scraper_input.job_type:
            # These two values are sent with an underscore; all others as-is.
            renamed = {"fulltime": "full_time", "parttime": "part_time"}
            raw_value = scraper_input.job_type.value
            job_type_value = renamed.get(raw_value, raw_value)

        params = {
            "search": scraper_input.search_term,
            "location": scraper_input.location,
            "page": page,
            "form": "jobs-landing",
        }
        if scraper_input.is_remote:
            params["refine_by_location_type"] = "only_remote"
        if scraper_input.distance:
            params["radius"] = scraper_input.distance
        if job_type_value:
            params["refine_by_employment"] = f"employment_type:employment_type:{job_type_value}"

        response = self.session.get(
            self.url + "/jobs-search",
            headers=ZipRecruiterScraper.headers(),
            params=params,
        )
        if response.status_code != 200:
            raise StatusException(response.status_code)

        soup = BeautifulSoup(response.text, "html.parser")

        # Without the embedded JSON, fall back to parsing the HTML job cards.
        script_tag = soup.find("script", {"id": "js_variables"})
        data = (
            json.loads(script_tag.string)
            if script_tag is not None and script_tag.string
            else {}
        )

        job_count = None
        if page == 1:
            total = data.get("totalJobCount")
            if total is not None:
                job_count = int(str(total).replace(",", ""))

        with ThreadPoolExecutor(max_workers=10) as executor:
            if data.get("jobList"):
                job_results = [
                    executor.submit(self.process_job_js, job)
                    for job in data["jobList"]
                ]
            else:
                jobs_html = soup.find_all("div", {"class": "job_content"})
                job_results = [
                    executor.submit(self.process_job_html, job) for job in jobs_html
                ]

        job_list = [
            post for post in (future.result() for future in job_results) if post
        ]
        return job_list, job_count

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes ZipRecruiter for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        # NOTE(review): ``max`` means at least three pages are always fetched,
        # even for a small results_wanted - confirm this over-fetch is intended.
        pages_to_process = max(
            3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
        )

        try:
            #: get first page to initialize session
            job_list, total_results = self.scrape_page(scraper_input, 1)

            with ThreadPoolExecutor(max_workers=10) as executor:
                futures: list[Future] = [
                    executor.submit(self.scrape_page, scraper_input, page)
                    for page in range(2, pages_to_process + 1)
                ]

                for future in futures:
                    jobs, _ = future.result()
                    job_list += jobs

        except StatusException as e:
            return JobResponse(
                success=False,
                error=f"ZipRecruiter returned status code {e.status_code}",
            )
        except Exception as e:
            return JobResponse(
                success=False,
                error=f"ZipRecruiter failed to scrape: {e}",
            )

        #: note: this does not handle if the results are more or less than the results_wanted
        if len(job_list) > scraper_input.results_wanted:
            job_list = job_list[: scraper_input.results_wanted]

        return JobResponse(
            success=True,
            jobs=job_list,
            total_results=total_results,
        )

    def process_job_html(self, job: Tag) -> Optional[JobPost]:
        """
        Parses a job from the job content tag
        :param job: BeautifulSoup Tag for one job post
        :return JobPost, or None for a duplicate or a card without a link
        """
        link = job.find("a", {"class": "job_link"})
        if link is None or not link.get("href"):
            return None
        job_url = link["href"]
        if job_url in self.seen_urls:
            return None
        # Record the URL so the duplicate check above takes effect.
        self.seen_urls.add(job_url)

        title = job.find("h2", {"class": "title"}).text
        company = job.find("a", {"class": "company_name"}).text.strip()

        description, updated_job_url = self.get_description(job_url)
        if updated_job_url is not None:
            job_url = updated_job_url
        if description is None:
            # Fall back to the snippet shown on the results page.
            snippet = job.find("p", {"class": "job_snippet"})
            description = snippet.text.strip() if snippet else None

        job_type = None
        job_type_element = job.find("li", {"class": "perk_item perk_type"})
        if job_type_element:
            job_type_text = (
                job_type_element.text.strip()
                .lower()
                .replace("-", "")
                .replace(" ", "")
            )
            try:
                job_type = ZipRecruiterScraper.job_type_from_string(job_type_text)
            except ValueError:
                # Keep the posting, without a job type, when unrecognised.
                job_type = None

        return JobPost(
            title=title,
            description=description,
            company_name=company,
            location=ZipRecruiterScraper.get_location(job),
            job_type=job_type,
            compensation=ZipRecruiterScraper.get_compensation(job),
            date_posted=ZipRecruiterScraper.get_date_posted(job),
            job_url=job_url,
        )

    def process_job_js(self, job: dict) -> Optional[JobPost]:
        """
        Maps one entry of the embedded job list to a JobPost
        :param job: job entry from the page's embedded JSON
        :return: JobPost, or None for a duplicate or an unrecognised employment type
        """
        job_url = job.get("JobURL")
        if job_url:
            if job_url in self.seen_urls:
                return None
            self.seen_urls.add(job_url)

        title = job.get("Title")
        description = BeautifulSoup(
            (job.get("Snippet") or "").strip(), "html.parser"
        ).get_text()
        company = job.get("OrgName")
        location = Location(city=job.get("City"), state=job.get("State"))

        try:
            job_type = ZipRecruiterScraper.job_type_from_string(
                (job.get("EmploymentType") or "").replace("-", "_").lower()
            )
        except ValueError:
            # Jobs with an unrecognised employment type are skipped.
            return None

        compensation = ZipRecruiterScraper._compensation_from_short_salary(
            job.get("FormattedSalaryShort") or ""
        )

        save_job_url = job.get("SaveJobURL") or ""
        posted_time_match = re.search(
            r"posted_time=(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)", save_job_url
        )
        if posted_time_match:
            date_posted = datetime.strptime(
                posted_time_match.group(1), "%Y-%m-%dT%H:%M:%SZ"
            ).date()
        else:
            date_posted = date.today()

        return JobPost(
            title=title,
            description=description,
            company_name=company,
            location=location,
            job_type=job_type,
            compensation=compensation,
            date_posted=date_posted,
            job_url=job_url,
        )

    @staticmethod
    def _parse_short_amount(token: str) -> Optional[int]:
        """
        Converts an amount token such as "$52.5K" or "1200" to an int
        :param token:
        :return: amount, or None when the token is not a number
        """
        cleaned = token.strip().lstrip("$").replace(",", "")
        try:
            if cleaned[-1:].upper() == "K":
                # Scale before converting so "52.5K" keeps its fraction;
                # round() absorbs float noise such as 32299.999999999996.
                return int(round(float(cleaned[:-1]) * 1000))
            return int(float(cleaned))
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def _compensation_from_short_salary(
        formatted_salary: str,
    ) -> Optional[Compensation]:
        """
        Builds a Compensation from a short salary string
        :param formatted_salary: text whose first token is the minimum and,
            when it starts with "$", whose third token is the maximum
        :return: compensation, or None when no minimum can be read
        """
        salary_parts = formatted_salary.split(" ")
        min_amount = ZipRecruiterScraper._parse_short_amount(salary_parts[0])
        if min_amount is None:
            return None

        max_amount = None
        if len(salary_parts) >= 3 and salary_parts[2].startswith("$"):
            max_amount = ZipRecruiterScraper._parse_short_amount(salary_parts[2])
        if max_amount is None:
            # A single figure is reported as both bounds, matching what
            # get_compensation yields for a single amount.
            max_amount = min_amount

        return Compensation(
            interval=CompensationInterval.YEARLY,
            min_amount=min_amount,
            max_amount=max_amount,
        )

    @staticmethod
    def job_type_from_string(value: str) -> Optional[JobType]:
        """
        Maps a job type string to its JobType
        :param value: job type text; underscores are ignored
        :return: JobType, or None for an empty value
        :raises ValueError: value does not match any JobType
        """
        if not value:
            return None

        if value.lower() == "contractor":
            value = "contract"
        normalized_value = value.replace("_", "")
        for item in JobType:
            if item.value == normalized_value:
                return item
        raise ValueError(f"Invalid value for JobType: {value}")

    def get_description(
        self,
        job_page_url: str
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Retrieves job description by going to the job page url
        :param job_page_url:
        :return: description or None, response url
        """
        response = self.session.get(
            job_page_url, headers=ZipRecruiterScraper.headers(), allow_redirects=True
        )
        if response.status_code not in range(200, 400):
            return None, None

        soup_job = BeautifulSoup(response.content, "html.parser")
        job_description_div = soup_job.find("div", {"class": "job_description"})
        if job_description_div:
            return job_description_div.text.strip(), response.url
        return None, response.url

    @staticmethod
    def get_interval(interval_str: str) -> CompensationInterval:
        """
        Maps the interval alias to its appropriate CompensationInterval.
        :param interval_str
        :return: CompensationInterval
        :raises ValueError: interval_str is neither an alias nor an interval value
        """
        interval_alias = {"annually": CompensationInterval.YEARLY}
        interval_str = interval_str.lower()

        if interval_str in interval_alias:
            return interval_alias[interval_str]

        return CompensationInterval(interval_str)

    @staticmethod
    def get_date_posted(job: BeautifulSoup) -> Optional[date]:
        """
        Extracts the date a job was posted
        :param job
        :return: date the job was posted or None
        """
        button = job.find(
            "button", {"class": "action_input save_job zrs_btn_secondary_200"}
        )
        if not button:
            return None

        url_time = button.get("data-href", "")
        params = parse_qs(urlparse(url_time).query)
        posted_time_str = params.get("posted_time", [None])[0]

        if posted_time_str:
            return datetime.strptime(
                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
            ).date()

        return None

    @staticmethod
    def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
        """
        Parses the compensation tag from the job BeautifulSoup object
        :param job
        :return: Compensation object, or None when pay is absent or unparseable
        """
        pay_element = job.find("li", {"class": "perk_item perk_pay"})
        if pay_element is None:
            return None

        value_div = pay_element.find("div", {"class": "value"})
        pay_span = value_div.find("span") if value_div is not None else None
        if pay_span is None:
            return None
        pay = pay_span.text.strip()
        if not pay:
            return None

        # The last word of the pay string names the interval.
        try:
            interval = ZipRecruiterScraper.get_interval(pay.split()[-1])
        except ValueError:
            return None

        amounts = []
        for chunk in pay.split("to"):
            token = chunk.replace(",", "").strip("$ ").split(" ")[0]
            amount = ZipRecruiterScraper._parse_short_amount(token)
            if amount is not None:
                amounts.append(amount)
        if not amounts:
            return None

        return Compensation(
            interval=interval, min_amount=min(amounts), max_amount=max(amounts)
        )

    @staticmethod
    def get_location(job: BeautifulSoup) -> Location:
        """
        Extracts the job location from BeautifulSoup object
        :param job:
        :return: location; city and state are None unless the text is "city, state"
        """
        city, state = None, None
        location_link = job.find("a", {"class": "company_location"})
        if location_link is not None:
            parts = location_link.text.strip().split(", ")
            if len(parts) == 2:
                city, state = parts

        return Location(
            city=city,
            state=state,
        )

    @staticmethod
    def headers() -> dict:
        """
        Returns headers needed for requests
        :return: dict - Dictionary containing headers
        """
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
        }