@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page

    Looks up the ``h3`` job-criteria subheader whose text contains
    "Industries" and returns the stripped text of the criteria ``span``
    that follows it as a sibling.

    :param soup_industry: parsed job page to search
    :return: the industry text, or None when either the subheader or its
        criteria span cannot be found
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        # BeautifulSoup hands the filter the tag's ``.string``, which is None
        # when the tag has no single text child; guard so ``in`` never
        # receives None and raises TypeError.
        string=lambda text: text is not None and "Industries" in text,
    )
    if not h3_tag:
        return None

    industry_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    if not industry_span:
        return None

    return industry_span.get_text(strip=True)