Add company industry and job level to linkedin scraper (#166)

pull/169/head
Ali Bakhshi Ilani 2024-07-16 05:37:39 +03:30 committed by GitHub
parent edffe18e65
commit 48631ea271
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 64 additions and 3 deletions

View File

@ -37,7 +37,7 @@ jobs = scrape_jobs(
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old) hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor country_indeed='USA', # only needed for indeed / glassdoor
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower) # linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"], # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
) )
@ -150,10 +150,15 @@ JobPost
├── emails (str) ├── emails (str)
└── is_remote (bool) └── is_remote (bool)
Linkedin specific
└── job_level (str)
Linkedin & Indeed specific
└── company_industry (str)
Indeed specific Indeed specific
├── company_country (str) ├── company_country (str)
└── company_addresses (str) └── company_addresses (str)
└── company_industry (str)
└── company_employees_label (str) └── company_employees_label (str)
└── company_revenue_label (str) └── company_revenue_label (str)
└── company_description (str) └── company_description (str)

View File

@ -208,6 +208,7 @@ def scrape_jobs(
"max_amount", "max_amount",
"currency", "currency",
"is_remote", "is_remote",
"job_level",
"job_function", "job_function",
"listing_type", "listing_type",
"emails", "emails",

View File

@ -244,9 +244,14 @@ class JobPost(BaseModel):
is_remote: bool | None = None is_remote: bool | None = None
listing_type: str | None = None listing_type: str | None = None
# linkedin specific
job_level: str | None = None
# linkedin and indeed specific
company_industry: str | None = None
# indeed specific # indeed specific
company_addresses: str | None = None company_addresses: str | None = None
company_industry: str | None = None
company_num_employees: str | None = None company_num_employees: str | None = None
company_revenue: str | None = None company_revenue: str | None = None
company_description: str | None = None company_description: str | None = None

View File

@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
job_url=f"{self.base_url}/jobs/view/{job_id}", job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation, compensation=compensation,
job_type=job_details.get("job_type"), job_type=job_details.get("job_type"),
job_level=job_details.get("job_level"),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"), description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"), job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")), emails=extract_emails_from_text(job_details.get("description")),
@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
job_function = job_function_span.text.strip() job_function = job_function_span.text.strip()
return { return {
"description": description, "description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup), "job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup), "job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get( "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):
return [get_enum_from_job_type(employment_type)] if employment_type else [] return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
    """
    Gets the job level (seniority level) from job page
    :param soup_job_level: parsed job page to search
    :return: stripped text of the criteria span that follows the
        "Seniority level" subheader, or None when the subheader or its
        span is not found
    """
    h3_tag = soup_job_level.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter receives tag.string, which is None for an <h3> that does
        # not hold exactly one text child; treat that as "no match" instead of
        # letting `in` raise TypeError and abort the whole parse.
        string=lambda text: text is not None and "Seniority level" in text,
    )
    if not h3_tag:
        return None

    job_level_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    if not job_level_span:
        return None

    return job_level_span.get_text(strip=True)
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page
    :param soup_industry: parsed job page to search
    :return: stripped text of the criteria span that follows the
        "Industries" subheader, or None when the subheader or its span
        is not found
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter receives tag.string, which is None for an <h3> that does
        # not hold exactly one text child; treat that as "no match" instead of
        # letting `in` raise TypeError and abort the whole parse.
        string=lambda text: text is not None and "Industries" in text,
    )
    if not h3_tag:
        return None

    industry_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    if not industry_span:
        return None

    return industry_span.get_text(strip=True)
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None: def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
""" """
Gets the job url direct from job page Gets the job url direct from job page