mirror of https://github.com/Bunsly/JobSpy

feat(jobs): filter by is_remote
parent  2b63d4c84f
commit  d69c41392d
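This commit threads a new is_remote flag from ScraperInput through the Indeed, LinkedIn, and ZipRecruiter scrapers, each via that site's own query parameter. Along the way it drops the Delivery/DeliveryEnum models in favor of a required job_url field on JobPost, makes location and distance optional, and builds canonical job URLs from each site's job id instead of third-party apply links.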
@@ -37,25 +37,16 @@ class Compensation(BaseModel):
     currency: str = "US"


-class DeliveryEnum(Enum):
-    EMAIL = "email"
-    URL = "url"
-
-
-class Delivery(BaseModel):
-    method: DeliveryEnum
-    value: str
-
-
 class JobPost(BaseModel):
     title: str
     company_name: str
+    job_url: str
     location: Location

     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
     date_posted: datetime = None
-    delivery: Delivery = None


 class JobResponse(BaseModel):
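With Delivery gone, callers read the link straight off JobPost.job_url. A hypothetical instantiation under the new schema (values are illustrative; Location's keyword fields are assumed from the hunk above):

    post = JobPost(
        title="Software Engineer",        # illustrative values only
        company_name="Acme Corp",
        job_url="https://www.indeed.com/viewjob?jk=1a2b3c4d5e6f",
        location=Location(city="Austin", state="TX", country="US"),
    )
    print(post.job_url)  # plain URL, no Delivery(method=..., value=...) wrapper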
@@ -11,10 +11,11 @@ class Site(Enum):

 class ScraperInput(BaseModel):
     site_type: Site

     search_term: str
-    location: str
-    distance: int = 25
+    location: str = None
+    distance: int = None
+    is_remote: bool = False

     results_wanted: int = 15

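Since location and distance now default to None, a remote-only search can omit geography entirely. A minimal sketch, assuming Site.INDEED from the enum above:

    scraper_input = ScraperInput(
        site_type=Site.INDEED,
        search_term="python developer",  # illustrative term
        is_remote=True,                  # no location or distance required
    )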
@@ -23,6 +23,7 @@ class IndeedScraper(Scraper):
         site = Site(Site.INDEED)
         super().__init__(site)
         self.url = "https://www.indeed.com/jobs"
+        self.job_url = "https://www.indeed.com/viewjob?jk="

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -41,14 +42,18 @@ class IndeedScraper(Scraper):
         while len(job_list) < scraper_input.results_wanted:
             params = {
                 "q": scraper_input.search_term,
-                "l": scraper_input.location,
+                "location": scraper_input.location,
+                "radius": scraper_input.distance,
+                "sc": "0kf:attr(DSQF7);" if scraper_input.is_remote else None,
                 "filter": 0,
                 "start": 0 + page * 10,
-                "radius": scraper_input.distance,
             }

             response = session.get(self.url, params=params)

+            if response.status_code == 307:
+                new_url = response.headers["Location"]
+                response = session.get(new_url)
             if response.status_code != status.HTTP_200_OK:
                 return JobResponse(
                     success=False,
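Two things are going on here. The "sc" value carries Indeed's serialized filter expression, where attr(DSQF7) appears to be the token for the remote attribute; when is_remote is False the value is None, and requests silently drops None-valued entries when building the query string. The new 307 branch re-issues the request against the Location header, without the original query string. A standalone check of the None-dropping behavior (not JobSpy code):

    import requests

    prepared = requests.Request(
        "GET",
        "https://www.indeed.com/jobs",
        params={"q": "python", "sc": None},
    ).prepare()
    print(prepared.url)  # https://www.indeed.com/jobs?q=python  ("sc" dropped)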
@@ -78,7 +83,7 @@ class IndeedScraper(Scraper):
             )

             for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]:
-                job_url = job["thirdPartyApplyUrl"]
+                job_url = f'{self.job_url}{job["jobkey"]}'
                 if job_url in seen_urls:
                     continue
                 snippet_html = BeautifulSoup(job["snippet"], "html.parser")
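Deduplication now keys on Indeed's canonical view URL rather than the third-party apply link, which a result may lack. The URL is just the new base plus the result's jobkey (the key shown is made up):

    job = {"jobkey": "1a2b3c4d5e6f"}  # hypothetical jobkey
    job_url = f'https://www.indeed.com/viewjob?jk={job["jobkey"]}'
    # -> https://www.indeed.com/viewjob?jk=1a2b3c4d5e6f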
@@ -104,10 +109,6 @@ class IndeedScraper(Scraper):
                 )

                 job_type = IndeedScraper.get_job_type(job)
-                if job.get("thirdPartyApplyUrl"):
-                    delivery = Delivery(method=DeliveryEnum.URL, value=job_url)
-                else:
-                    delivery = None
                 timestamp_seconds = job["pubDate"] / 1000
                 date_posted = datetime.fromtimestamp(timestamp_seconds)

@@ -117,15 +118,15 @@ class IndeedScraper(Scraper):
                     description=first_li.text if first_li else None,
                     company_name=job["company"],
                     location=Location(
-                        city=job["jobLocationCity"],
-                        state=job["jobLocationState"],
+                        city=job.get("jobLocationCity"),
+                        state=job.get("jobLocationState"),
                         postal_code=job.get("jobLocationPostal"),
                         country="US",
                     ),
                     job_type=job_type,
                     compensation=compensation,
                     date_posted=date_posted,
-                    delivery=delivery,
+                    job_url=job_url,
                 )
                 job_list.append(job_post)
                 if len(job_list) >= scraper_input.results_wanted:
@@ -17,7 +17,8 @@ class LinkedInScraper(Scraper):
         site = Site(Site.LINKEDIN)
         super().__init__(site)

-        self.url = "https://www.linkedin.com/jobs"
+        self.url = "https://www.linkedin.com/jobs/search/"
+        self.job_url = "https://www.linkedin.com/jobs/view/"

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -32,12 +33,13 @@ class LinkedInScraper(Scraper):
         with requests.Session() as session:
             while len(job_list) < scraper_input.results_wanted:
                 params = {
-                    "pageNum": page,
+                    "keywords": scraper_input.search_term,
                     "location": scraper_input.location,
                     "distance": scraper_input.distance,
+                    "f_WT": 2 if scraper_input.is_remote else None,
+                    "pageNum": page,
                 }

-                self.url = f"{self.url}/{scraper_input.search_term}-jobs"
                 response = session.get(self.url, params=params, allow_redirects=True)

                 if response.status_code != status.HTTP_200_OK:
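f_WT is LinkedIn's workplace-type facet; 2 is commonly reported as the remote value, and the None sentinel again keeps the key out of non-remote queries. Moving the search term into a keywords parameter also fixes a latent bug: the deleted line mutated self.url inside the pagination loop, appending "<term>-jobs" again on every page. A sketch of how the old pattern compounds (illustrative only):

    url = "https://www.linkedin.com/jobs"
    for page in range(2):
        url = f"{url}/python-jobs"  # old pattern: grows each iteration
    print(url)  # https://www.linkedin.com/jobs/python-jobs/python-jobs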
@@ -58,8 +60,11 @@ class LinkedInScraper(Scraper):
                     "div",
                     class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
                 ):
-                    job_url_tag = job_card.find("a", class_="base-card__full-link")
-                    job_url = job_url_tag["href"] if job_url_tag else "N/A"
+                    data_entity_urn = job_card.get("data-entity-urn", "")
+                    job_id = (
+                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
+                    )
+                    job_url = f"{self.job_url}{job_id}"
                     if job_url in seen_urls:
                         continue
                     seen_urls.add(job_url)
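The job id now comes from the card's data-entity-urn attribute, which typically has the form urn:li:jobPosting:<id>; splitting on ":" takes the trailing id, and the stable /jobs/view/ URL is built from it. A standalone sketch with a made-up URN:

    data_entity_urn = "urn:li:jobPosting:3590245467"  # hypothetical value
    job_id = data_entity_urn.split(":")[-1]           # "3590245467"
    job_url = f"https://www.linkedin.com/jobs/view/{job_id}"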
@@ -91,7 +96,7 @@ class LinkedInScraper(Scraper):
                         company_name=company,
                         location=location,
                         date_posted=date_posted,
-                        delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                        job_url=job_url,
                     )
                     job_list.append(job_post)
                     if len(job_list) >= scraper_input.results_wanted:
@@ -38,8 +38,11 @@ class ZipRecruiterScraper(Scraper):
             params = {
                 "search": scraper_input.search_term,
                 "location": scraper_input.location,
-                "page": page,
                 "radius": scraper_input.distance,
+                "refine_by_location_type": "only_remote"
+                if scraper_input.is_remote
+                else None,
+                "page": page,
             }

             response = session.get(
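ZipRecruiter exposes the same switch as refine_by_location_type=only_remote; the line-wrapped conditional appears to be Black's formatting of a ternary inside a dict literal. A hypothetical alternative (not in this codebase) is to prune None entries explicitly instead of relying on requests to drop them:

    def prune(params: dict) -> dict:
        """Drop None-valued entries before the request is built."""
        return {k: v for k, v in params.items() if v is not None}

    is_remote = False
    print(prune({
        "search": "python developer",  # illustrative values
        "refine_by_location_type": "only_remote" if is_remote else None,
        "page": 1,
    }))  # {'search': 'python developer', 'page': 1}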
@@ -88,7 +91,7 @@ class ZipRecruiterScraper(Scraper):
                 job_type=job_type,
                 compensation=ZipRecruiterScraper.get_compensation(job),
                 date_posted=date_posted,
-                delivery=Delivery(method=DeliveryEnum.URL, value=job_url),
+                job_url=job_url,
             )
             job_list.append(job_post)
             if len(job_list) >= scraper_input.results_wanted: