From d69c41392dd6a3ff5e276de05a4bb4d0fab2d1ab Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Tue, 11 Jul 2023 05:42:20 -0500 Subject: [PATCH] feat(jobs): filter by is_remote --- api/core/jobs/__init__.py | 13 ++----------- api/core/scrapers/__init__.py | 7 ++++--- api/core/scrapers/indeed/__init__.py | 21 +++++++++++---------- api/core/scrapers/linkedin/__init__.py | 17 +++++++++++------ api/core/scrapers/ziprecruiter/__init__.py | 7 +++++-- 5 files changed, 33 insertions(+), 32 deletions(-) diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index ee7e63e..37262e5 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -37,25 +37,16 @@ class Compensation(BaseModel): currency: str = "US" -class DeliveryEnum(Enum): - EMAIL = "email" - URL = "url" - - -class Delivery(BaseModel): - method: DeliveryEnum - value: str - - class JobPost(BaseModel): title: str company_name: str + job_url: str location: Location + description: str = None job_type: JobType = None compensation: Compensation = None date_posted: datetime = None - delivery: Delivery = None class JobResponse(BaseModel): diff --git a/api/core/scrapers/__init__.py b/api/core/scrapers/__init__.py index 018ab4c..80dc204 100644 --- a/api/core/scrapers/__init__.py +++ b/api/core/scrapers/__init__.py @@ -11,10 +11,11 @@ class Site(Enum): class ScraperInput(BaseModel): site_type: Site - search_term: str - location: str - distance: int = 25 + + location: str = None + distance: int = None + is_remote: bool = False results_wanted: int = 15 diff --git a/api/core/scrapers/indeed/__init__.py b/api/core/scrapers/indeed/__init__.py index 5bdb905..3d6a158 100644 --- a/api/core/scrapers/indeed/__init__.py +++ b/api/core/scrapers/indeed/__init__.py @@ -23,6 +23,7 @@ class IndeedScraper(Scraper): site = Site(Site.INDEED) super().__init__(site) self.url = "https://www.indeed.com/jobs" + self.job_url = "https://www.indeed.com/viewjob?jk=" def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ @@ -41,14 +42,18 @@ class IndeedScraper(Scraper): while len(job_list) < scraper_input.results_wanted: params = { "q": scraper_input.search_term, - "l": scraper_input.location, + "location": scraper_input.location, + "radius": scraper_input.distance, + "sc": "0kf:attr(DSQF7);" if scraper_input.is_remote else None, "filter": 0, "start": 0 + page * 10, - "radius": scraper_input.distance, } response = session.get(self.url, params=params) + if response.status_code == 307: + new_url = response.headers["Location"] + response = session.get(new_url) if response.status_code != status.HTTP_200_OK: return JobResponse( success=False, @@ -78,7 +83,7 @@ class IndeedScraper(Scraper): ) for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]: - job_url = job["thirdPartyApplyUrl"] + job_url = f'{self.job_url}{job["jobkey"]}' if job_url in seen_urls: continue snippet_html = BeautifulSoup(job["snippet"], "html.parser") @@ -104,10 +109,6 @@ class IndeedScraper(Scraper): ) job_type = IndeedScraper.get_job_type(job) - if job.get("thirdPartyApplyUrl"): - delivery = Delivery(method=DeliveryEnum.URL, value=job_url) - else: - delivery = None timestamp_seconds = job["pubDate"] / 1000 date_posted = datetime.fromtimestamp(timestamp_seconds) @@ -117,15 +118,15 @@ class IndeedScraper(Scraper): description=first_li.text if first_li else None, company_name=job["company"], location=Location( - city=job["jobLocationCity"], - state=job["jobLocationState"], + city=job.get("jobLocationCity"), + state=job.get("jobLocationState"), postal_code=job.get("jobLocationPostal"), country="US", ), job_type=job_type, compensation=compensation, date_posted=date_posted, - delivery=delivery, + job_url=job_url, ) job_list.append(job_post) if len(job_list) >= scraper_input.results_wanted: diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index aaec33b..d88db08 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -17,7 +17,8 @@ class LinkedInScraper(Scraper): site = Site(Site.LINKEDIN) super().__init__(site) - self.url = "https://www.linkedin.com/jobs" + self.url = "https://www.linkedin.com/jobs/search/" + self.job_url = "https://www.linkedin.com/jobs/view/" def scrape(self, scraper_input: ScraperInput) -> JobResponse: """ @@ -32,12 +33,13 @@ class LinkedInScraper(Scraper): with requests.Session() as session: while len(job_list) < scraper_input.results_wanted: params = { - "pageNum": page, + "keywords": scraper_input.search_term, "location": scraper_input.location, "distance": scraper_input.distance, + "f_WT": 2 if scraper_input.is_remote else None, + "pageNum": page, } - self.url = f"{self.url}/{scraper_input.search_term}-jobs" response = session.get(self.url, params=params, allow_redirects=True) if response.status_code != status.HTTP_200_OK: @@ -58,8 +60,11 @@ class LinkedInScraper(Scraper): "div", class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", ): - job_url_tag = job_card.find("a", class_="base-card__full-link") - job_url = job_url_tag["href"] if job_url_tag else "N/A" + data_entity_urn = job_card.get("data-entity-urn", "") + job_id = ( + data_entity_urn.split(":")[-1] if data_entity_urn else "N/A" + ) + job_url = f"{self.job_url}{job_id}" if job_url in seen_urls: continue seen_urls.add(job_url) @@ -91,7 +96,7 @@ class LinkedInScraper(Scraper): company_name=company, location=location, date_posted=date_posted, - delivery=Delivery(method=DeliveryEnum.URL, value=job_url), + job_url=job_url, ) job_list.append(job_post) if len(job_list) >= scraper_input.results_wanted: diff --git a/api/core/scrapers/ziprecruiter/__init__.py b/api/core/scrapers/ziprecruiter/__init__.py index 24de04b..8f9cb73 100644 --- a/api/core/scrapers/ziprecruiter/__init__.py +++ b/api/core/scrapers/ziprecruiter/__init__.py @@ -38,8 +38,11 @@ class ZipRecruiterScraper(Scraper): params = { "search": scraper_input.search_term, "location": scraper_input.location, - "page": page, "radius": scraper_input.distance, + "refine_by_location_type": "only_remote" + if scraper_input.is_remote + else None, + "page": page, } response = session.get( @@ -88,7 +91,7 @@ class ZipRecruiterScraper(Scraper): job_type=job_type, compensation=ZipRecruiterScraper.get_compensation(job), date_posted=date_posted, - delivery=Delivery(method=DeliveryEnum.URL, value=job_url), + job_url=job_url, ) job_list.append(job_post) if len(job_list) >= scraper_input.results_wanted: