diff --git a/README.md b/README.md
index a5baf0c..185822f 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ jobs = scrape_jobs(
     hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
     country_indeed='USA',  # only needed for indeed / glassdoor
 
-    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
+    # linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
     # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
 )
 
@@ -150,10 +150,15 @@ JobPost
 ├── emails (str)
 └── is_remote (bool)
 
+Linkedin specific
+└── job_level (str)
+
+Linkedin & Indeed specific
+└── company_industry (str)
+
 Indeed specific
 ├── company_country (str)
 └── company_addresses (str)
-└── company_industry (str)
 └── company_employees_label (str)
 └── company_revenue_label (str)
 └── company_description (str)
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index b199347..a29f571 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -208,6 +208,7 @@ def scrape_jobs(
         "max_amount",
         "currency",
         "is_remote",
+        "job_level",
         "job_function",
         "listing_type",
         "emails",
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 98eb458..c8af981 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -244,9 +244,14 @@ class JobPost(BaseModel):
     is_remote: bool | None = None
     listing_type: str | None = None
 
+    # linkedin specific
+    job_level: str | None = None
+
+    # linkedin and indeed specific
+    company_industry: str | None = None
+
     # indeed specific
     company_addresses: str | None = None
-    company_industry: str | None = None
     company_num_employees: str | None = None
     company_revenue: str | None = None
     company_description: str | None = None
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 3db5557..45bf31a 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
             job_url=f"{self.base_url}/jobs/view/{job_id}",
             compensation=compensation,
             job_type=job_details.get("job_type"),
+            job_level=job_details.get("job_level"),
+            company_industry=job_details.get("company_industry"),
             description=job_details.get("description"),
             job_url_direct=job_details.get("job_url_direct"),
             emails=extract_emails_from_text(job_details.get("description")),
@@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
             job_function = job_function_span.text.strip()
         return {
             "description": description,
+            "job_level": self._parse_job_level(soup),
+            "company_industry": self._parse_company_industry(soup),
             "job_type": self._parse_job_type(soup),
             "job_url_direct": self._parse_job_url_direct(soup),
             "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
@@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):
 
         return [get_enum_from_job_type(employment_type)] if employment_type else []
 
+    @staticmethod
+    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
+        """
+        Gets the job level from job page
+        :param soup_job_level:
+        :return: str
+        """
+        h3_tag = soup_job_level.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            string=lambda text: "Seniority level" in text,
+        )
+        job_level = None
+        if h3_tag:
+            job_level_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if job_level_span:
+                job_level = job_level_span.get_text(strip=True)
+
+        return job_level
+
+    @staticmethod
+    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
+        """
+        Gets the company industry from job page
+        :param soup_industry:
+        :return: str
+        """
+        h3_tag = soup_industry.find(
+            "h3",
+            class_="description__job-criteria-subheader",
+            string=lambda text: "Industries" in text,
+        )
+        industry = None
+        if h3_tag:
+            industry_span = h3_tag.find_next_sibling(
+                "span",
+                class_="description__job-criteria-text description__job-criteria-text--criteria",
+            )
+            if industry_span:
+                industry = industry_span.get_text(strip=True)
+
+        return industry
+
     def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
         """
         Gets the job url direct from job page
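
Reviewer note, outside the patch: a minimal usage sketch of how the new `job_level` and `company_industry` fields surface to callers. It assumes the `scrape_jobs` parameters from the existing README example (`site_name`, `search_term`, and `results_wanted` are not part of this diff) and that the call returns a pandas DataFrame ordered by the `desired_order` list touched in `src/jobspy/__init__.py`.

```python
from jobspy import scrape_jobs

# linkedin_fetch_description=True makes the LinkedIn scraper fetch each job page,
# which is where _parse_job_level and _parse_company_industry read the
# "Seniority level" and "Industries" criteria from.
jobs = scrape_jobs(
    site_name=["linkedin"],
    search_term="software engineer",
    results_wanted=5,
    linkedin_fetch_description=True,  # slower: one extra request per job
)

# The new columns appear in the returned DataFrame; they are empty when the
# criteria section is missing from the job page.
print(jobs[["title", "job_level", "company_industry"]])
```

Since Indeed already supplied `company_industry`, that field moves from the Indeed-specific block to a shared "Linkedin & Indeed" block in both the README schema and `JobPost`, while `job_level` is LinkedIn-only.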