Compare commits

...

3 Commits

Author SHA1 Message Date
Cullen Watson
89a3ee231c enh(li): job function (#160) 2024-05-28 16:01:29 -05:00
Cullen
6439f71433 chore: version 2024-05-28 15:39:24 -05:00
adamagassi
7f6271b2e0 LinkedIn scraper fixes: (#159)
Correct initial page offset calculation
Separate page variable from request counter
Fix job offset starting value
Increment offset by number of jobs returned instead of expected value
2024-05-28 15:38:13 -05:00
5 changed files with 31 additions and 17 deletions

View File

@@ -13,9 +13,6 @@ work with us.*
- Aggregates the job postings in a Pandas DataFrame
- Proxies support
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
### Installation
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
country_indeed='USA', # only needed for indeed / glassdoor
# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
# proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
```
### Output

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.54"
version = "1.1.55"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -182,6 +182,7 @@ def scrape_jobs(
"max_amount",
"currency",
"is_remote",
"job_function",
"emails",
"description",
"company_url",

View File

@@ -254,6 +254,9 @@ class JobPost(BaseModel):
logo_photo_url: str | None = None
banner_photo_url: str | None = None
# LinkedIn only at the moment
job_function: str | None = None
class JobResponse(BaseModel):
jobs: list[JobPost] = []

View File

@@ -13,7 +13,6 @@ import regex as re
from typing import Optional
from datetime import datetime
from threading import Lock
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
@@ -71,8 +70,8 @@ class LinkedInScraper(Scraper):
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_urls = set()
url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0
seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
lambda: len(job_list) < scraper_input.results_wanted and page < 1000
)
while continue_search():
logger.info(f"LinkedIn search page: {page // 25 + 1}")
request_count += 1
logger.info(f"LinkedIn search page: {request_count}")
params = {
"keywords": scraper_input.search_term,
"location": scraper_input.location,
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
else None
),
"pageNum": 0,
"start": page + scraper_input.offset,
"start": page,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": (
",".join(map(str, scraper_input.linkedin_company_ids))
@@ -140,7 +140,6 @@ class LinkedInScraper(Scraper):
job_id = href.split("-")[-1]
job_url = f"{self.base_url}/jobs/view/{job_id}"
with url_lock:
if job_url in seen_urls:
continue
seen_urls.add(job_url)
@@ -156,7 +155,7 @@ class LinkedInScraper(Scraper):
if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
page += self.jobs_per_page
page += len(job_list)
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
@@ -225,6 +224,7 @@ class LinkedInScraper(Scraper):
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
job_function=job_details.get("job_function"),
)
def _get_id(self, url: str):
@@ -248,7 +248,7 @@ class LinkedInScraper(Scraper):
response.raise_for_status()
except:
return {}
if response.url == "https://www.linkedin.com/signup":
if "linkedin.com/signup" in response.url:
return {}
soup = BeautifulSoup(response.text, "html.parser")
@@ -267,6 +267,18 @@ class LinkedInScraper(Scraper):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)
job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
return {
"description": description,
"job_type": self._parse_job_type(soup),
@@ -274,6 +286,7 @@ class LinkedInScraper(Scraper):
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
"data-delayed-url"
),
"job_function": job_function,
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location: