mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6330c14879 | ||
|
|
48631ea271 | ||
|
|
edffe18e65 | ||
|
|
0988230a24 |
@@ -37,7 +37,7 @@ jobs = scrape_jobs(
|
|||||||
hours_old=72, # (only Linkedin/Indeed are hour specific, others round up to days old)
|
hours_old=72, # (only Linkedin/Indeed are hour specific, others round up to days old)
|
||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA', # only needed for indeed / glassdoor
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
# linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
|
||||||
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
@@ -150,10 +150,15 @@ JobPost
|
|||||||
├── emails (str)
|
├── emails (str)
|
||||||
└── is_remote (bool)
|
└── is_remote (bool)
|
||||||
|
|
||||||
|
Linkedin specific
|
||||||
|
└── job_level (str)
|
||||||
|
|
||||||
|
Linkedin & Indeed specific
|
||||||
|
└── company_industry (str)
|
||||||
|
|
||||||
Indeed specific
|
Indeed specific
|
||||||
├── company_country (str)
|
├── company_country (str)
|
||||||
└── company_addresses (str)
|
└── company_addresses (str)
|
||||||
└── company_industry (str)
|
|
||||||
└── company_employees_label (str)
|
└── company_employees_label (str)
|
||||||
└── company_revenue_label (str)
|
└── company_revenue_label (str)
|
||||||
└── company_description (str)
|
└── company_description (str)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.57"
|
version = "1.1.58"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|||||||
@@ -208,13 +208,15 @@ def scrape_jobs(
|
|||||||
"max_amount",
|
"max_amount",
|
||||||
"currency",
|
"currency",
|
||||||
"is_remote",
|
"is_remote",
|
||||||
|
"job_level",
|
||||||
"job_function",
|
"job_function",
|
||||||
|
"company_industry",
|
||||||
|
"listing_type",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
"company_url",
|
"company_url",
|
||||||
"company_url_direct",
|
"company_url_direct",
|
||||||
"company_addresses",
|
"company_addresses",
|
||||||
"company_industry",
|
|
||||||
"company_num_employees",
|
"company_num_employees",
|
||||||
"company_revenue",
|
"company_revenue",
|
||||||
"company_description",
|
"company_description",
|
||||||
|
|||||||
@@ -242,10 +242,16 @@ class JobPost(BaseModel):
|
|||||||
date_posted: date | None = None
|
date_posted: date | None = None
|
||||||
emails: list[str] | None = None
|
emails: list[str] | None = None
|
||||||
is_remote: bool | None = None
|
is_remote: bool | None = None
|
||||||
|
listing_type: str | None = None
|
||||||
|
|
||||||
|
# linkedin specific
|
||||||
|
job_level: str | None = None
|
||||||
|
|
||||||
|
# linkedin and indeed specific
|
||||||
|
company_industry: str | None = None
|
||||||
|
|
||||||
# indeed specific
|
# indeed specific
|
||||||
company_addresses: str | None = None
|
company_addresses: str | None = None
|
||||||
company_industry: str | None = None
|
|
||||||
company_num_employees: str | None = None
|
company_num_employees: str | None = None
|
||||||
company_revenue: str | None = None
|
company_revenue: str | None = None
|
||||||
company_description: str | None = None
|
company_description: str | None = None
|
||||||
|
|||||||
@@ -189,6 +189,15 @@ class GlassdoorScraper(Scraper):
|
|||||||
except:
|
except:
|
||||||
description = None
|
description = None
|
||||||
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
|
company_url = f"{self.base_url}Overview/W-EI_IE{company_id}.htm"
|
||||||
|
company_logo = (
|
||||||
|
job_data["jobview"].get("overview", {}).get("squareLogoUrl", None)
|
||||||
|
)
|
||||||
|
listing_type = (
|
||||||
|
job_data["jobview"]
|
||||||
|
.get("header", {})
|
||||||
|
.get("adOrderSponsorshipLevel", "")
|
||||||
|
.lower()
|
||||||
|
)
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job_id),
|
id=str(job_id),
|
||||||
title=title,
|
title=title,
|
||||||
@@ -201,6 +210,8 @@ class GlassdoorScraper(Scraper):
|
|||||||
is_remote=is_remote,
|
is_remote=is_remote,
|
||||||
description=description,
|
description=description,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
|
logo_photo_url=company_logo,
|
||||||
|
listing_type=listing_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _fetch_job_description(self, job_id):
|
def _fetch_job_description(self, job_id):
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ class IndeedScraper(Scraper):
|
|||||||
keys.append("DSQF7")
|
keys.append("DSQF7")
|
||||||
|
|
||||||
if keys:
|
if keys:
|
||||||
keys_str = '", "'.join(keys) # Prepare your keys string
|
keys_str = '", "'.join(keys)
|
||||||
filters_str = f"""
|
filters_str = f"""
|
||||||
filters: {{
|
filters: {{
|
||||||
composite: {{
|
composite: {{
|
||||||
@@ -244,6 +244,7 @@ class IndeedScraper(Scraper):
|
|||||||
.replace("Iv1", "")
|
.replace("Iv1", "")
|
||||||
.replace("_", " ")
|
.replace("_", " ")
|
||||||
.title()
|
.title()
|
||||||
|
.strip()
|
||||||
if employer_details.get("industry")
|
if employer_details.get("industry")
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
@@ -353,7 +354,6 @@ class IndeedScraper(Scraper):
|
|||||||
jobSearch(
|
jobSearch(
|
||||||
{what}
|
{what}
|
||||||
{location}
|
{location}
|
||||||
includeSponsoredResults: NONE
|
|
||||||
limit: 100
|
limit: 100
|
||||||
sort: DATE
|
sort: DATE
|
||||||
{cursor}
|
{cursor}
|
||||||
@@ -365,6 +365,9 @@ class IndeedScraper(Scraper):
|
|||||||
results {{
|
results {{
|
||||||
trackingKey
|
trackingKey
|
||||||
job {{
|
job {{
|
||||||
|
source {{
|
||||||
|
name
|
||||||
|
}}
|
||||||
key
|
key
|
||||||
title
|
title
|
||||||
datePublished
|
datePublished
|
||||||
|
|||||||
@@ -219,6 +219,8 @@ class LinkedInScraper(Scraper):
|
|||||||
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
job_url=f"{self.base_url}/jobs/view/{job_id}",
|
||||||
compensation=compensation,
|
compensation=compensation,
|
||||||
job_type=job_details.get("job_type"),
|
job_type=job_details.get("job_type"),
|
||||||
|
job_level=job_details.get("job_level", "").lower(),
|
||||||
|
company_industry=job_details.get("company_industry"),
|
||||||
description=job_details.get("description"),
|
description=job_details.get("description"),
|
||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
@@ -266,6 +268,8 @@ class LinkedInScraper(Scraper):
|
|||||||
job_function = job_function_span.text.strip()
|
job_function = job_function_span.text.strip()
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
|
"job_level": self._parse_job_level(soup),
|
||||||
|
"company_industry": self._parse_company_industry(soup),
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
"job_url_direct": self._parse_job_url_direct(soup),
|
"job_url_direct": self._parse_job_url_direct(soup),
|
||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
||||||
@@ -325,6 +329,52 @@ class LinkedInScraper(Scraper):
|
|||||||
|
|
||||||
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
||||||
|
|
||||||
|
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
    """
    Gets the job level from job page
    :param soup_job_level: parsed job page that is searched for the
        "Seniority level" criteria subheader
    :return: stripped text of the criteria span that follows the
        "Seniority level" subheader, or None when either the subheader
        or the span is not found
    """
    h3_tag = soup_job_level.find(
        "h3",
        class_="description__job-criteria-subheader",
        # A tag's .string is None when it holds more than one child, and that
        # None is what the filter receives; guard so `in` never sees None.
        string=lambda text: bool(text) and "Seniority level" in text,
    )
    if not h3_tag:
        return None

    job_level_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    return job_level_span.get_text(strip=True) if job_level_span else None
|
||||||
|
|
||||||
|
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry from job page
    :param soup_industry: parsed job page that is searched for the
        "Industries" criteria subheader
    :return: stripped text of the criteria span that follows the
        "Industries" subheader, or None when either the subheader or
        the span is not found
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        # A tag's .string is None when it holds more than one child, and that
        # None is what the filter receives; guard so `in` never sees None.
        string=lambda text: bool(text) and "Industries" in text,
    )
    if not h3_tag:
        return None

    industry_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    return industry_span.get_text(strip=True) if industry_span else None
|
||||||
|
|
||||||
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
|
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
|
||||||
"""
|
"""
|
||||||
Gets the job url direct from job page
|
Gets the job url direct from job page
|
||||||
|
|||||||
@@ -135,6 +135,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
self.seen_urls.add(job_url)
|
self.seen_urls.add(job_url)
|
||||||
|
|
||||||
description = job.get("job_description", "").strip()
|
description = job.get("job_description", "").strip()
|
||||||
|
listing_type = job.get("buyer_type", "")
|
||||||
description = (
|
description = (
|
||||||
markdown_converter(description)
|
markdown_converter(description)
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
|
||||||
@@ -175,6 +176,7 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
description=description_full if description_full else description,
|
description=description_full if description_full else description,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
job_url_direct=job_url_direct,
|
job_url_direct=job_url_direct,
|
||||||
|
listing_type=listing_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_descr(self, job_url):
|
def _get_descr(self, job_url):
|
||||||
|
|||||||
Reference in New Issue
Block a user