mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 03:54:31 -08:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89a3ee231c | ||
|
|
6439f71433 | ||
|
|
7f6271b2e0 |
@@ -13,9 +13,6 @@ work with us.*
|
|||||||
- Aggregates the job postings in a Pandas DataFrame
|
- Aggregates the job postings in a Pandas DataFrame
|
||||||
- Proxies support
|
- Proxies support
|
||||||
|
|
||||||
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
|
|
||||||
Updated for release v1.1.3
|
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
|
|||||||
country_indeed='USA', # only needed for indeed / glassdoor
|
country_indeed='USA', # only needed for indeed / glassdoor
|
||||||
|
|
||||||
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
|
||||||
# proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
|
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
|
||||||
|
|
||||||
)
|
)
|
||||||
print(f"Found {len(jobs)} jobs")
|
print(f"Found {len(jobs)} jobs")
|
||||||
print(jobs.head())
|
print(jobs.head())
|
||||||
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
|
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
|
||||||
```
|
```
|
||||||
|
|
||||||
### Output
|
### Output
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.54"
|
version = "1.1.55"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|||||||
@@ -182,6 +182,7 @@ def scrape_jobs(
|
|||||||
"max_amount",
|
"max_amount",
|
||||||
"currency",
|
"currency",
|
||||||
"is_remote",
|
"is_remote",
|
||||||
|
"job_function",
|
||||||
"emails",
|
"emails",
|
||||||
"description",
|
"description",
|
||||||
"company_url",
|
"company_url",
|
||||||
|
|||||||
@@ -254,6 +254,9 @@ class JobPost(BaseModel):
|
|||||||
logo_photo_url: str | None = None
|
logo_photo_url: str | None = None
|
||||||
banner_photo_url: str | None = None
|
banner_photo_url: str | None = None
|
||||||
|
|
||||||
|
# linkedin only atm
|
||||||
|
job_function: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class JobResponse(BaseModel):
|
class JobResponse(BaseModel):
|
||||||
jobs: list[JobPost] = []
|
jobs: list[JobPost] = []
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ import regex as re
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from threading import Lock
|
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urlparse, urlunparse, unquote
|
from urllib.parse import urlparse, urlunparse, unquote
|
||||||
@@ -71,8 +70,8 @@ class LinkedInScraper(Scraper):
|
|||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
job_list: list[JobPost] = []
|
job_list: list[JobPost] = []
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
url_lock = Lock()
|
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
|
||||||
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
|
request_count = 0
|
||||||
seconds_old = (
|
seconds_old = (
|
||||||
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
|
||||||
)
|
)
|
||||||
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
|
|||||||
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
|
||||||
)
|
)
|
||||||
while continue_search():
|
while continue_search():
|
||||||
logger.info(f"LinkedIn search page: {page // 25 + 1}")
|
request_count += 1
|
||||||
|
logger.info(f"LinkedIn search page: {request_count}")
|
||||||
params = {
|
params = {
|
||||||
"keywords": scraper_input.search_term,
|
"keywords": scraper_input.search_term,
|
||||||
"location": scraper_input.location,
|
"location": scraper_input.location,
|
||||||
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
|
|||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
"pageNum": 0,
|
"pageNum": 0,
|
||||||
"start": page + scraper_input.offset,
|
"start": page,
|
||||||
"f_AL": "true" if scraper_input.easy_apply else None,
|
"f_AL": "true" if scraper_input.easy_apply else None,
|
||||||
"f_C": (
|
"f_C": (
|
||||||
",".join(map(str, scraper_input.linkedin_company_ids))
|
",".join(map(str, scraper_input.linkedin_company_ids))
|
||||||
@@ -140,10 +140,9 @@ class LinkedInScraper(Scraper):
|
|||||||
job_id = href.split("-")[-1]
|
job_id = href.split("-")[-1]
|
||||||
job_url = f"{self.base_url}/jobs/view/{job_id}"
|
job_url = f"{self.base_url}/jobs/view/{job_id}"
|
||||||
|
|
||||||
with url_lock:
|
if job_url in seen_urls:
|
||||||
if job_url in seen_urls:
|
continue
|
||||||
continue
|
seen_urls.add(job_url)
|
||||||
seen_urls.add(job_url)
|
|
||||||
try:
|
try:
|
||||||
fetch_desc = scraper_input.linkedin_fetch_description
|
fetch_desc = scraper_input.linkedin_fetch_description
|
||||||
job_post = self._process_job(job_card, job_url, fetch_desc)
|
job_post = self._process_job(job_card, job_url, fetch_desc)
|
||||||
@@ -156,7 +155,7 @@ class LinkedInScraper(Scraper):
|
|||||||
|
|
||||||
if continue_search():
|
if continue_search():
|
||||||
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
|
||||||
page += self.jobs_per_page
|
page += len(job_list)
|
||||||
|
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
job_list = job_list[: scraper_input.results_wanted]
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
@@ -225,6 +224,7 @@ class LinkedInScraper(Scraper):
|
|||||||
job_url_direct=job_details.get("job_url_direct"),
|
job_url_direct=job_details.get("job_url_direct"),
|
||||||
emails=extract_emails_from_text(job_details.get("description")),
|
emails=extract_emails_from_text(job_details.get("description")),
|
||||||
logo_photo_url=job_details.get("logo_photo_url"),
|
logo_photo_url=job_details.get("logo_photo_url"),
|
||||||
|
job_function=job_details.get("job_function"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_id(self, url: str):
|
def _get_id(self, url: str):
|
||||||
@@ -248,7 +248,7 @@ class LinkedInScraper(Scraper):
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
except:
|
except:
|
||||||
return {}
|
return {}
|
||||||
if response.url == "https://www.linkedin.com/signup":
|
if "linkedin.com/signup" in response.url:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
@@ -267,6 +267,18 @@ class LinkedInScraper(Scraper):
|
|||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
description = markdown_converter(description)
|
description = markdown_converter(description)
|
||||||
|
|
||||||
|
h3_tag = soup.find(
|
||||||
|
"h3", text=lambda text: text and "Job function" in text.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
job_function = None
|
||||||
|
if h3_tag:
|
||||||
|
job_function_span = h3_tag.find_next(
|
||||||
|
"span", class_="description__job-criteria-text"
|
||||||
|
)
|
||||||
|
if job_function_span:
|
||||||
|
job_function = job_function_span.text.strip()
|
||||||
return {
|
return {
|
||||||
"description": description,
|
"description": description,
|
||||||
"job_type": self._parse_job_type(soup),
|
"job_type": self._parse_job_type(soup),
|
||||||
@@ -274,6 +286,7 @@ class LinkedInScraper(Scraper):
|
|||||||
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
|
||||||
"data-delayed-url"
|
"data-delayed-url"
|
||||||
),
|
),
|
||||||
|
"job_function": job_function,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||||
|
|||||||
Reference in New Issue
Block a user