diff --git a/api/core/jobs/__init__.py b/api/core/jobs/__init__.py index 88c052b..823eb82 100644 --- a/api/core/jobs/__init__.py +++ b/api/core/jobs/__init__.py @@ -16,6 +16,7 @@ class JobType(Enum): NIGHTS = "nights" OTHER = "other" SUMMER = "summer" + VOLUNTEER = "volunteer" class Location(BaseModel): @@ -48,7 +49,6 @@ class JobPost(BaseModel): description: str = None job_type: JobType = None compensation: Compensation = None - # why is 08-28-2023 a validiation error for type date? how do I fix this? date_posted: date = None diff --git a/api/core/scrapers/linkedin/__init__.py b/api/core/scrapers/linkedin/__init__.py index cbbafa7..c39458a 100644 --- a/api/core/scrapers/linkedin/__init__.py +++ b/api/core/scrapers/linkedin/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple from datetime import datetime import requests @@ -103,7 +103,7 @@ class LinkedInScraper(Scraper): datetime_tag = metadata_card.find( "time", class_="job-search-card__listdate" ) - description = LinkedInScraper.get_description(job_url) + description, job_type = LinkedInScraper.get_description(job_url) if datetime_tag: datetime_str = datetime_tag["datetime"] date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") @@ -117,6 +117,7 @@ class LinkedInScraper(Scraper): location=location, date_posted=date_posted, job_url=job_url, + job_type=job_type, ) job_list.append(job_post) if ( @@ -149,7 +150,7 @@ class LinkedInScraper(Scraper): """ response = requests.get(job_page_url, allow_redirects=True) if response.status_code not in range(200, 400): - return None + return None, None soup = BeautifulSoup(response.text, "html.parser") div_content = soup.find( @@ -159,7 +160,36 @@ class LinkedInScraper(Scraper): text_content = None if div_content: text_content = " ".join(div_content.get_text().split()).strip() - return text_content + + def get_job_type( + soup: BeautifulSoup, + ) -> Tuple[Optional[str], Optional[JobType]]: + """ + Gets the job type from job page + :param soup: + :return: JobType + """ + h3_tag = soup.find( + "h3", + class_="description__job-criteria-subheader", + string=lambda text: "Employment type" in text, + ) + + employment_type = None + if h3_tag: + employment_type_span = h3_tag.find_next_sibling( + "span", + class_="description__job-criteria-text description__job-criteria-text--criteria", + ) + if employment_type_span: + employment_type = employment_type_span.get_text(strip=True) + employment_type = employment_type.lower() + employment_type = employment_type.replace("-", "") + print(employment_type) + + return JobType(employment_type) + + return text_content, get_job_type(soup) @staticmethod def get_location(metadata_card: Optional[Tag]) -> Location: