mirror of https://github.com/Bunsly/JobSpy
Add company industry and job level to linkedin scraper (#166)
parent
edffe18e65
commit
48631ea271
|
@ -37,7 +37,7 @@ jobs = scrape_jobs(
|
||||||
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
|
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
|
||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA', # only needed for indeed / glassdoor
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
# linkedin_fetch_description=True # get full description , direct job url , company industry and job level (seniority level) for linkedin (slower)
|
||||||
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
|
@ -150,10 +150,15 @@ JobPost
|
||||||
├── emails (str)
|
├── emails (str)
|
||||||
└── is_remote (bool)
|
└── is_remote (bool)
|
||||||
|
|
||||||
|
Linkedin specific
|
||||||
|
└── job_level (str)
|
||||||
|
|
||||||
|
Linkedin & Indeed specific
|
||||||
|
└── company_industry (str)
|
||||||
|
|
||||||
Indeed specific
|
Indeed specific
|
||||||
├── company_country (str)
|
├── company_country (str)
|
||||||
└── company_addresses (str)
|
└── company_addresses (str)
|
||||||
└── company_industry (str)
|
|
||||||
└── company_employees_label (str)
|
└── company_employees_label (str)
|
||||||
└── company_revenue_label (str)
|
└── company_revenue_label (str)
|
||||||
└── company_description (str)
|
└── company_description (str)
|
||||||
|
|
|
@ -208,6 +208,7 @@ def scrape_jobs(
|
||||||
"max_amount",
|
"max_amount",
|
||||||
"currency",
|
"currency",
|
||||||
"is_remote",
|
"is_remote",
|
||||||
|
"job_level",
|
||||||
"job_function",
|
"job_function",
|
||||||
"listing_type",
|
"listing_type",
|
||||||
"emails",
|
"emails",
|
||||||
|
|
|
@ -244,9 +244,14 @@ class JobPost(BaseModel):
|
||||||
is_remote: bool | None = None
|
is_remote: bool | None = None
|
||||||
listing_type: str | None = None
|
listing_type: str | None = None
|
||||||
|
|
||||||
|
# linkedin specific
|
||||||
|
job_level: str | None = None
|
||||||
|
|
||||||
|
# linkedin and indeed specific
|
||||||
|
company_industry: str | None = None
|
||||||
|
|
||||||
# indeed specific
|
# indeed specific
|
||||||
company_addresses: str | None = None
|
company_addresses: str | None = None
|
||||||
company_industry: str | None = None
|
|
||||||
company_num_employees: str | None = None
|
company_num_employees: str | None = None
|
||||||
company_revenue: str | None = None
|
company_revenue: str | None = None
|
||||||
company_description: str | None = None
|
company_description: str | None = None
|
||||||
|
|
|
@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
|
||||||
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||||
compensation=compensation,
|
compensation=compensation,
|
||||||
job_type=job_details.get("job_type"),
|
job_type=job_details.get("job_type"),
|
||||||
|
job_level=job_details.get("job_level"),
|
||||||
|
company_industry=job_details.get("company_industry"),
|
||||||
description=job_details.get("description"),
|
description=job_details.get("description"),
|
||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
|
@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
|
||||||
job_function = job_function_span.text.strip()
|
job_function = job_function_span.text.strip()
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
|
"job_level": self._parse_job_level(soup),
|
||||||
|
"company_industry": self._parse_company_industry(soup),
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
"job_url_direct": self._parse_job_url_direct(soup),
|
"job_url_direct": self._parse_job_url_direct(soup),
|
||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
||||||
|
@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):
|
||||||
|
|
||||||
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
|
||||||
|
"""
|
||||||
|
Gets the job level from job page
|
||||||
|
:param soup_job_level:
|
||||||
|
:return: str
|
||||||
|
"""
|
||||||
|
h3_tag = soup_job_level.find(
|
||||||
|
"h3",
|
||||||
|
class_="description__job-criteria-subheader",
|
||||||
|
string=lambda text: "Seniority level" in text,
|
||||||
|
)
|
||||||
|
job_level = None
|
||||||
|
if h3_tag:
|
||||||
|
job_level_span = h3_tag.find_next_sibling(
|
||||||
|
"span",
|
||||||
|
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||||
|
)
|
||||||
|
if job_level_span:
|
||||||
|
job_level = job_level_span.get_text(strip=True)
|
||||||
|
|
||||||
|
return job_level
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
|
||||||
|
"""
|
||||||
|
Gets the company industry from job page
|
||||||
|
:param soup_industry:
|
||||||
|
:return: str
|
||||||
|
"""
|
||||||
|
h3_tag = soup_industry.find(
|
||||||
|
"h3",
|
||||||
|
class_="description__job-criteria-subheader",
|
||||||
|
string=lambda text: "Industries" in text,
|
||||||
|
)
|
||||||
|
industry = None
|
||||||
|
if h3_tag:
|
||||||
|
industry_span = h3_tag.find_next_sibling(
|
||||||
|
"span",
|
||||||
|
class_="description__job-criteria-text description__job-criteria-text--criteria",
|
||||||
|
)
|
||||||
|
if industry_span:
|
||||||
|
industry = industry_span.get_text(strip=True)
|
||||||
|
|
||||||
|
return industry
|
||||||
|
|
||||||
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
|
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
|
||||||
"""
|
"""
|
||||||
Gets the job url direct from job page
|
Gets the job url direct from job page
|
||||||
|
|
Loading…
Reference in New Issue