feat(jobs): filter by is_remote

pull/12/head
Cullen Watson 2023-07-11 05:42:20 -05:00
parent 2b63d4c84f
commit d69c41392d
5 changed files with 33 additions and 32 deletions

View File

@ -37,25 +37,16 @@ class Compensation(BaseModel):
currency: str = "US" currency: str = "US"
class DeliveryEnum(Enum):
EMAIL = "email"
URL = "url"
class Delivery(BaseModel):
method: DeliveryEnum
value: str
class JobPost(BaseModel): class JobPost(BaseModel):
title: str title: str
company_name: str company_name: str
job_url: str
location: Location location: Location
description: str = None description: str = None
job_type: JobType = None job_type: JobType = None
compensation: Compensation = None compensation: Compensation = None
date_posted: datetime = None date_posted: datetime = None
delivery: Delivery = None
class JobResponse(BaseModel): class JobResponse(BaseModel):

View File

@ -11,10 +11,11 @@ class Site(Enum):
class ScraperInput(BaseModel): class ScraperInput(BaseModel):
site_type: Site site_type: Site
search_term: str search_term: str
location: str
distance: int = 25 location: str = None
distance: int = None
is_remote: bool = False
results_wanted: int = 15 results_wanted: int = 15

View File

@ -23,6 +23,7 @@ class IndeedScraper(Scraper):
site = Site(Site.INDEED) site = Site(Site.INDEED)
super().__init__(site) super().__init__(site)
self.url = "https://www.indeed.com/jobs" self.url = "https://www.indeed.com/jobs"
self.job_url = "https://www.indeed.com/viewjob?jk="
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
@ -41,14 +42,18 @@ class IndeedScraper(Scraper):
while len(job_list) < scraper_input.results_wanted: while len(job_list) < scraper_input.results_wanted:
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
"l": scraper_input.location, "location": scraper_input.location,
"radius": scraper_input.distance,
"sc": "0kf:attr(DSQF7);" if scraper_input.is_remote else None,
"filter": 0, "filter": 0,
"start": 0 + page * 10, "start": 0 + page * 10,
"radius": scraper_input.distance,
} }
response = session.get(self.url, params=params) response = session.get(self.url, params=params)
if response.status_code == 307:
new_url = response.headers["Location"]
response = session.get(new_url)
if response.status_code != status.HTTP_200_OK: if response.status_code != status.HTTP_200_OK:
return JobResponse( return JobResponse(
success=False, success=False,
@ -78,7 +83,7 @@ class IndeedScraper(Scraper):
) )
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
job_url = job["thirdPartyApplyUrl"] job_url = f'{self.job_url}{job["jobkey"]}'
if job_url in seen_urls: if job_url in seen_urls:
continue continue
snippet_html = BeautifulSoup(job["snippet"], "html.parser") snippet_html = BeautifulSoup(job["snippet"], "html.parser")
@ -104,10 +109,6 @@ class IndeedScraper(Scraper):
) )
job_type = IndeedScraper.get_job_type(job) job_type = IndeedScraper.get_job_type(job)
if job.get("thirdPartyApplyUrl"):
delivery = Delivery(method=DeliveryEnum.URL, value=job_url)
else:
delivery = None
timestamp_seconds = job["pubDate"] / 1000 timestamp_seconds = job["pubDate"] / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = datetime.fromtimestamp(timestamp_seconds)
@ -117,15 +118,15 @@ class IndeedScraper(Scraper):
description=first_li.text if first_li else None, description=first_li.text if first_li else None,
company_name=job["company"], company_name=job["company"],
location=Location( location=Location(
city=job["jobLocationCity"], city=job.get("jobLocationCity"),
state=job["jobLocationState"], state=job.get("jobLocationState"),
postal_code=job.get("jobLocationPostal"), postal_code=job.get("jobLocationPostal"),
country="US", country="US",
), ),
job_type=job_type, job_type=job_type,
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
delivery=delivery, job_url=job_url,
) )
job_list.append(job_post) job_list.append(job_post)
if len(job_list) >= scraper_input.results_wanted: if len(job_list) >= scraper_input.results_wanted:

View File

@ -17,7 +17,8 @@ class LinkedInScraper(Scraper):
site = Site(Site.LINKEDIN) site = Site(Site.LINKEDIN)
super().__init__(site) super().__init__(site)
self.url = "https://www.linkedin.com/jobs" self.url = "https://www.linkedin.com/jobs/search/"
self.job_url = "https://www.linkedin.com/jobs/view/"
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
@ -32,12 +33,13 @@ class LinkedInScraper(Scraper):
with requests.Session() as session: with requests.Session() as session:
while len(job_list) < scraper_input.results_wanted: while len(job_list) < scraper_input.results_wanted:
params = { params = {
"pageNum": page, "keywords": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
"distance": scraper_input.distance, "distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"pageNum": page,
} }
self.url = f"{self.url}/{scraper_input.search_term}-jobs"
response = session.get(self.url, params=params, allow_redirects=True) response = session.get(self.url, params=params, allow_redirects=True)
if response.status_code != status.HTTP_200_OK: if response.status_code != status.HTTP_200_OK:
@ -58,8 +60,11 @@ class LinkedInScraper(Scraper):
"div", "div",
class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
): ):
job_url_tag = job_card.find("a", class_="base-card__full-link") data_entity_urn = job_card.get("data-entity-urn", "")
job_url = job_url_tag["href"] if job_url_tag else "N/A" job_id = (
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
)
job_url = f"{self.job_url}{job_id}"
if job_url in seen_urls: if job_url in seen_urls:
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)
@ -91,7 +96,7 @@ class LinkedInScraper(Scraper):
company_name=company, company_name=company,
location=location, location=location,
date_posted=date_posted, date_posted=date_posted,
delivery=Delivery(method=DeliveryEnum.URL, value=job_url), job_url=job_url,
) )
job_list.append(job_post) job_list.append(job_post)
if len(job_list) >= scraper_input.results_wanted: if len(job_list) >= scraper_input.results_wanted:

View File

@ -38,8 +38,11 @@ class ZipRecruiterScraper(Scraper):
params = { params = {
"search": scraper_input.search_term, "search": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
"page": page,
"radius": scraper_input.distance, "radius": scraper_input.distance,
"refine_by_location_type": "only_remote"
if scraper_input.is_remote
else None,
"page": page,
} }
response = session.get( response = session.get(
@ -88,7 +91,7 @@ class ZipRecruiterScraper(Scraper):
job_type=job_type, job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job), compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted, date_posted=date_posted,
delivery=Delivery(method=DeliveryEnum.URL, value=job_url), job_url=job_url,
) )
job_list.append(job_post) job_list.append(job_post)
if len(job_list) >= scraper_input.results_wanted: if len(job_list) >= scraper_input.results_wanted: