feat: Add company_industry to linkedin scraper

pull/166/head
Ali Bakhshi Ilani 2024-07-06 19:59:40 +03:30
parent d000a81eb3
commit 4ff002916c
2 changed files with 28 additions and 1 deletions

View File

@ -243,9 +243,11 @@ class JobPost(BaseModel):
emails: list[str] | None = None
is_remote: bool | None = None
# linkedin and indeed specific
company_industry: str | None = None
# indeed specific
company_addresses: str | None = None
company_industry: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None

View File

@ -219,6 +219,7 @@ class LinkedInScraper(Scraper):
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
@ -266,6 +267,7 @@ class LinkedInScraper(Scraper):
job_function = job_function_span.text.strip()
return {
"description": description,
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
@ -325,6 +327,29 @@ class LinkedInScraper(Scraper):
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page

    Finds the job-criteria subheader (``<h3>``) whose text contains
    "Industries" and reads the text of the criteria ``<span>`` sibling
    that follows it.

    :param soup_industry: parsed job page to search
    :return: the industry text with surrounding whitespace stripped, or
        None when the subheader or its criteria span is not found
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter may be called with None (a tag's .string is None when
        # it has no single string child); guard so the membership test
        # cannot raise TypeError and abort parsing of the whole job page.
        string=lambda text: text is not None and "Industries" in text,
    )
    if h3_tag is None:
        return None

    industry_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    if industry_span is None:
        return None

    return industry_span.get_text(strip=True)
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page