mirror of https://github.com/Bunsly/JobSpy
parent
6439f71433
commit
89a3ee231c
|
@ -13,9 +13,6 @@ work with us.*
|
||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a Pandas DataFrame
|
||||||
- Proxies support
|
- Proxies support
|
||||||
|
|
||||||
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
|
|
||||||
Updated for release v1.1.3
|
|
||||||
|
|
||||||
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
|
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
|
@ -46,7 +43,7 @@ jobs = scrape_jobs(
|
||||||
)
|
)
|
||||||
print(f"Found {len(jobs)} jobs")
|
print(f"Found {len(jobs)} jobs")
|
||||||
print(jobs.head())
|
print(jobs.head())
|
||||||
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
|
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
|
||||||
```
|
```
|
||||||
|
|
||||||
### Output
|
### Output
|
||||||
|
|
|
@ -182,6 +182,7 @@ def scrape_jobs(
|
||||||
"max_amount",
|
"max_amount",
|
||||||
"currency",
|
"currency",
|
||||||
"is_remote",
|
"is_remote",
|
||||||
|
"job_function",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
"company_url",
|
"company_url",
|
||||||
|
|
|
@ -254,6 +254,9 @@ class JobPost(BaseModel):
|
||||||
logo_photo_url: str | None = None
|
logo_photo_url: str | None = None
|
||||||
banner_photo_url: str | None = None
|
banner_photo_url: str | None = None
|
||||||
|
|
||||||
|
# linkedin only atm
|
||||||
|
job_function: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class JobResponse(BaseModel):
|
class JobResponse(BaseModel):
|
||||||
jobs: list[JobPost] = []
|
jobs: list[JobPost] = []
|
||||||
|
|
|
@ -224,6 +224,7 @@ class LinkedInScraper(Scraper):
|
||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
logo_photo_url=job_details.get("logo_photo_url"),
|
logo_photo_url=job_details.get("logo_photo_url"),
|
||||||
|
job_function=job_details.get("job_function"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_id(self, url: str):
|
def _get_id(self, url: str):
|
||||||
|
@ -247,7 +248,7 @@ class LinkedInScraper(Scraper):
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except:
|
except:
|
||||||
return {}
|
return {}
|
||||||
if response.url == "https://www.linkedin.com/signup":
|
if "linkedin.com/signup" in response.url:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
@ -266,6 +267,18 @@ class LinkedInScraper(Scraper):
|
||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
|
||||||
|
h3_tag = soup.find(
|
||||||
|
"h3", text=lambda text: text and "Job function" in text.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
job_function = None
|
||||||
|
if h3_tag:
|
||||||
|
job_function_span = h3_tag.find_next(
|
||||||
|
"span", class_="description__job-criteria-text"
|
||||||
|
)
|
||||||
|
if job_function_span:
|
||||||
|
job_function = job_function_span.text.strip()
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
|
@ -273,6 +286,7 @@ class LinkedInScraper(Scraper):
|
||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
||||||
"data-delayed-url"
|
"data-delayed-url"
|
||||||
),
|
),
|
||||||
|
"job_function": job_function,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||||
|
|
Loading…
Reference in New Issue