enh(li): job function (#160)

pull/162/head v1.1.55
Cullen Watson 2024-05-28 16:01:29 -05:00 committed by GitHub
parent 6439f71433
commit 89a3ee231c
4 changed files with 20 additions and 5 deletions

README.md

@@ -13,9 +13,6 @@ work with us.*
 - Aggregates the job postings in a Pandas DataFrame
 - Proxies support
 
-[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
-Updated for release v1.1.3
-
 ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
 
 ### Installation
@@ -46,7 +43,7 @@ jobs = scrape_jobs(
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
+jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
 ```
 
 ### Output
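The corrected comment points to pandas' `to_excel` as the alternative export. A minimal sketch of that route, assuming the optional `openpyxl` dependency is installed; note that `to_excel` takes none of the CSV-specific `quoting`/`escapechar` arguments:

```python
# same DataFrame, exported as .xlsx instead of CSV
# (requires: pip install openpyxl)
jobs.to_excel("jobs.xlsx", index=False)
```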

src/jobspy/__init__.py

@@ -182,6 +182,7 @@ def scrape_jobs(
         "max_amount",
         "currency",
         "is_remote",
+        "job_function",
         "emails",
         "description",
         "company_url",

src/jobspy/jobs/__init__.py

@@ -254,6 +254,9 @@ class JobPost(BaseModel):
     logo_photo_url: str | None = None
     banner_photo_url: str | None = None
+
+    # linkedin only atm
+    job_function: str | None = None
 
 
 class JobResponse(BaseModel):
     jobs: list[JobPost] = []
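The new field follows the model's existing optional-field pattern. A trimmed sketch of how it behaves (the real `JobPost` has many more fields; only the shape is shown here, under a hypothetical name):

```python
from pydantic import BaseModel

class JobPostSketch(BaseModel):
    # mirrors the JobPost pattern: `str | None = None` makes the
    # field optional with a default of None
    title: str
    job_function: str | None = None  # linkedin only atm

post = JobPostSketch(title="Data Engineer")
print(post.job_function)  # None -> boards other than LinkedIn never set it
```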

src/jobspy/scrapers/linkedin/__init__.py

@@ -224,6 +224,7 @@ class LinkedInScraper(Scraper):
             job_url_direct=job_details.get("job_url_direct"),
             emails=extract_emails_from_text(job_details.get("description")),
             logo_photo_url=job_details.get("logo_photo_url"),
+            job_function=job_details.get("job_function"),
         )
 
     def _get_id(self, url: str):
@@ -247,7 +248,7 @@ class LinkedInScraper(Scraper):
             response.raise_for_status()
         except:
             return {}
-        if response.url == "https://www.linkedin.com/signup":
+        if "linkedin.com/signup" in response.url:
             return {}
 
         soup = BeautifulSoup(response.text, "html.parser")
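The loosened check reflects that LinkedIn's auth-wall redirect rarely lands on the bare signup URL; any path suffix or query string defeats an exact comparison. A quick illustration (the redirect URL is made up):

```python
# a made-up but typical auth-wall redirect target
url = "https://www.linkedin.com/signup/cold-join?trk=guest_job_details"

print(url == "https://www.linkedin.com/signup")  # False -> old check misses it
print("linkedin.com/signup" in url)              # True  -> new check bails out
```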
@@ -266,6 +267,18 @@ class LinkedInScraper(Scraper):
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                 description = markdown_converter(description)
+
+        h3_tag = soup.find(
+            "h3", text=lambda text: text and "Job function" in text.strip()
+        )
+        job_function = None
+        if h3_tag:
+            job_function_span = h3_tag.find_next(
+                "span", class_="description__job-criteria-text"
+            )
+            if job_function_span:
+                job_function = job_function_span.text.strip()
+
         return {
             "description": description,
             "job_type": self._parse_job_type(soup),
@@ -273,6 +286,7 @@ class LinkedInScraper(Scraper):
             "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
                 "data-delayed-url"
             ),
+            "job_function": job_function,
         }
 
     def _get_location(self, metadata_card: Optional[Tag]) -> Location:
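For reference, here is the new extraction logic run standalone against a trimmed fragment of a job details page. The class names come from the diff above; the surrounding markup and the sample value are illustrative:

```python
from bs4 import BeautifulSoup

html = """
<h3 class="description__job-criteria-subheader">Job function</h3>
<span class="description__job-criteria-text">Engineering and Information Technology</span>
"""

soup = BeautifulSoup(html, "html.parser")

# find the criteria header, then the first matching span after it,
# exactly as in the hunk above
h3_tag = soup.find("h3", text=lambda text: text and "Job function" in text.strip())
job_function = None
if h3_tag:
    span = h3_tag.find_next("span", class_="description__job-criteria-text")
    if span:
        job_function = span.text.strip()

print(job_function)  # Engineering and Information Technology
```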