add offset param & email extraction (#51)

* add offset param

* [enh]: extract emails
Author: Cullen Watson
Date:   2023-09-28 18:11:28 -05:00
Committed by: GitHub
Parent: 286b9e1256
Commit: af07c1ecbd

17 changed files with 1209 additions and 1126 deletions
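The new `offset` parameter lets a caller start a LinkedIn search partway into the result set instead of always beginning at posting 0 (the guest search endpoint pages in steps of 25). A minimal usage sketch, assuming the public `scrape_jobs()` entry point picks the parameter up from `ScraperInput` — that wiring is in the other changed files, not in the excerpt below:

```python
# Hypothetical usage sketch; the parameter names follow the ScraperInput
# fields referenced in the diff below, not a verified public API.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["linkedin"],
    search_term="software engineer",
    results_wanted=25,
    offset=50,  # new in this commit: skip the first 50 postings
)
```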


@@ -4,14 +4,16 @@ jobspy.scrapers.linkedin
 This module contains routines to scrape LinkedIn.
 """
-from typing import Optional, Tuple
+from typing import Optional
 from datetime import datetime
-import traceback
 import requests
-from requests.exceptions import Timeout, ProxyError
+import time
+from requests.exceptions import ProxyError
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
+from bs4.element import Tag
+from threading import Lock
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
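The new `time` import, together with the `MAX_RETRIES`/`DELAY` class attributes added in the next hunk, backs a retry-on-429 loop in `scrape()`. As a standalone illustration of that pattern (the URL and constants here are placeholders, not jobspy code):

```python
# Standalone sketch of the rate-limit retry pattern this commit introduces
# in LinkedInScraper.scrape(); not the committed code itself.
import time
import requests

MAX_RETRIES = 3
DELAY = 10  # seconds to back off after a 429 response

def get_with_retry(url: str, **kwargs) -> requests.Response:
    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                time.sleep(DELAY)  # rate limited: wait, then retry
                continue
            raise
    raise RuntimeError("Max retries reached, failed to get a valid response")
```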
@@ -20,17 +22,20 @@ from ...jobs import (
     Location,
     JobResponse,
     JobType,
-    Compensation,
-    CompensationInterval,
 )
+from ...utils import extract_emails_from_text


 class LinkedInScraper(Scraper):
+    MAX_RETRIES = 3
+    DELAY = 10
+
     def __init__(self, proxy: Optional[str] = None):
         """
         Initializes LinkedInScraper with the LinkedIn job search url
         """
         site = Site(Site.LINKEDIN)
+        self.country = "worldwide"
         self.url = "https://www.linkedin.com"

         super().__init__(site, proxy=proxy)
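`extract_emails_from_text` comes from jobspy's utils module; its body is not part of this diff. A plausible regex-based sketch of what such a helper does:

```python
# Plausible sketch only; the real helper lives in jobspy's utils module
# and may differ in regex and return-type details.
import re
from typing import Optional

EMAIL_REGEX = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+\.[A-Za-z0-9.-]+")

def extract_emails_from_text(text: Optional[str]) -> Optional[list[str]]:
    if not text:
        return None
    return EMAIL_REGEX.findall(text) or None
```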
@@ -40,12 +45,12 @@ class LinkedInScraper(Scraper):
         :param scraper_input:
         :return: job_response
         """
-        self.country = "worldwide"
         job_list: list[JobPost] = []
         seen_urls = set()
-        page, processed_jobs, job_count = 0, 0, 0
+        url_lock = Lock()
+        page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0

-        def job_type_code(job_type):
+        def job_type_code(job_type_enum):
             mapping = {
                 JobType.FULL_TIME: "F",
                 JobType.PART_TIME: "P",
@@ -54,117 +59,125 @@ class LinkedInScraper(Scraper):
                 JobType.TEMPORARY: "T",
             }

-            return mapping.get(job_type, "")
+            return mapping.get(job_type_enum, "")

-        with requests.Session() as session:
-            while len(job_list) < scraper_input.results_wanted:
-                params = {
-                    "keywords": scraper_input.search_term,
-                    "location": scraper_input.location,
-                    "distance": scraper_input.distance,
-                    "f_WT": 2 if scraper_input.is_remote else None,
-                    "f_JT": job_type_code(scraper_input.job_type)
-                    if scraper_input.job_type
-                    else None,
-                    "pageNum": page,
-                    "f_AL": "true" if scraper_input.easy_apply else None,
-                }
+        while len(job_list) < scraper_input.results_wanted and page < 1000:
+            params = {
+                "keywords": scraper_input.search_term,
+                "location": scraper_input.location,
+                "distance": scraper_input.distance,
+                "f_WT": 2 if scraper_input.is_remote else None,
+                "f_JT": job_type_code(scraper_input.job_type)
+                if scraper_input.job_type
+                else None,
+                "pageNum": 0,
+                page: page + scraper_input.offset,
+                "f_AL": "true" if scraper_input.easy_apply else None,
+            }

-                params = {k: v for k, v in params.items() if v is not None}
+            params = {k: v for k, v in params.items() if v is not None}

+            retries = 0
+            while retries < self.MAX_RETRIES:
                 try:
-                    response = session.get(
-                        f"{self.url}/jobs/search",
+                    response = requests.get(
+                        f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                         params=params,
                         allow_redirects=True,
                         proxies=self.proxy,
                         timeout=10,
                     )
                     response.raise_for_status()
+                    break
                 except requests.HTTPError as e:
-                    raise LinkedInException(
-                        f"bad response status code: {response.status_code}"
-                    )
+                    if hasattr(e, 'response') and e.response is not None:
+                        if e.response.status_code == 429:
+                            time.sleep(self.DELAY)
+                            retries += 1
+                            continue
+                        else:
+                            raise LinkedInException(f"bad response status code: {e.response.status_code}")
+                    else:
+                        raise
                 except ProxyError as e:
                     raise LinkedInException("bad proxy")
-                except (ProxyError, Exception) as e:
+                except Exception as e:
                     raise LinkedInException(str(e))
+            else:
+                # Raise an exception if the maximum number of retries is reached
+                raise LinkedInException("Max retries reached, failed to get a valid response")

-                soup = BeautifulSoup(response.text, "html.parser")
+            soup = BeautifulSoup(response.text, "html.parser")

-                if page == 0:
-                    job_count_text = soup.find(
-                        "span", class_="results-context-header__job-count"
-                    ).text
-                    job_count = int("".join(filter(str.isdigit, job_count_text)))
+            with ThreadPoolExecutor(max_workers=5) as executor:
+                futures = []
+                for job_card in soup.find_all("div", class_="base-search-card"):
+                    job_url = None
+                    href_tag = job_card.find("a", class_="base-card__full-link")
+                    if href_tag and "href" in href_tag.attrs:
+                        href = href_tag.attrs["href"].split("?")[0]
+                        job_id = href.split("-")[-1]
+                        job_url = f"{self.url}/jobs/view/{job_id}"

-                for job_card in soup.find_all(
-                    "div",
-                    class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
-                ):
-                    processed_jobs += 1
-                    data_entity_urn = job_card.get("data-entity-urn", "")
-                    job_id = (
-                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
-                    )
-                    job_url = f"{self.url}/jobs/view/{job_id}"
-                    if job_url in seen_urls:
-                        continue
-                    seen_urls.add(job_url)
-                    job_info = job_card.find("div", class_="base-search-card__info")
-                    if job_info is None:
-                        continue
-                    title_tag = job_info.find("h3", class_="base-search-card__title")
-                    title = title_tag.text.strip() if title_tag else "N/A"
+                    with url_lock:
+                        if job_url in seen_urls:
+                            continue
+                        seen_urls.add(job_url)

-                    company_tag = job_info.find("a", class_="hidden-nested-link")
-                    company = company_tag.text.strip() if company_tag else "N/A"
+                    futures.append(executor.submit(self.process_job, job_card, job_url))

-                    metadata_card = job_info.find(
-                        "div", class_="base-search-card__metadata"
-                    )
-                    location: Location = self.get_location(metadata_card)
-
-                    datetime_tag = metadata_card.find(
-                        "time", class_="job-search-card__listdate"
-                    )
-                    description, job_type = self.get_description(job_url)
-                    if datetime_tag:
-                        datetime_str = datetime_tag["datetime"]
-                        try:
-                            date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
-                        except Exception as e:
-                            date_posted = None
-                    else:
-                        date_posted = None
-                    job_post = JobPost(
-                        title=title,
-                        description=description,
-                        company_name=company,
-                        location=location,
-                        date_posted=date_posted,
-                        job_url=job_url,
-                        job_type=job_type,
-                        compensation=Compensation(
-                            interval=CompensationInterval.YEARLY, currency=None
-                        ),
-                    )
-                    job_list.append(job_post)
-                    if processed_jobs >= job_count:
-                        break
-                    if len(job_list) >= scraper_input.results_wanted:
-                        break
-                if processed_jobs >= job_count:
-                    break
-                if len(job_list) >= scraper_input.results_wanted:
-                    break
-                page += 1
+                for future in as_completed(futures):
+                    try:
+                        job_post = future.result()
+                        if job_post:
+                            job_list.append(job_post)
+                    except Exception as e:
+                        raise LinkedInException("Exception occurred while processing jobs")
+
+            page += 25

         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

-    def get_description(self, job_page_url: str) -> Optional[str]:
+    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+        title_tag = job_card.find("span", class_="sr-only")
+        title = title_tag.get_text(strip=True) if title_tag else "N/A"
+
+        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
+        company_a_tag = company_tag.find("a") if company_tag else None
+        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
+
+        metadata_card = job_card.find("div", class_="base-search-card__metadata")
+        location = self.get_location(metadata_card)
+
+        datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None
+        date_posted = None
+        if datetime_tag and "datetime" in datetime_tag.attrs:
+            datetime_str = datetime_tag["datetime"]
+            try:
+                date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+            except Exception as e:
+                date_posted = None
+        benefits_tag = job_card.find("span", class_="result-benefits__text")
+        benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
+
+        description, job_type = self.get_job_description(job_url)
+
+        return JobPost(
+            title=title,
+            description=description,
+            company_name=company,
+            location=location,
+            date_posted=date_posted,
+            job_url=job_url,
+            job_type=job_type,
+            benefits=benefits,
+            emails=extract_emails_from_text(description)
+        )
+
+    def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
+        str | None, tuple[str | None, JobType | None]]:
         """
         Retrieves job description by going to the job page url
         :param job_page_url:
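The restructured `scrape()` above fans each job card out to `process_job()` on a five-worker thread pool and gathers results with `as_completed`, with a `Lock` guarding the shared `seen_urls` set. The generic shape of that fan-out/fan-in pattern, with a stand-in for the per-card work:

```python
# Generic illustration of the submit/as_completed pattern used above;
# parse_card() is a stand-in for LinkedInScraper.process_job(), which
# additionally fetches each job page over the network.
from concurrent.futures import ThreadPoolExecutor, as_completed

def parse_card(card: dict) -> dict | None:
    return {"title": card.get("title", "N/A")}

cards = [{"title": "Data Engineer"}, {"title": "ML Engineer"}]
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(parse_card, card) for card in cards]
    for future in as_completed(futures):
        result = future.result()  # re-raises any worker exception here
        if result:
            results.append(result)
```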
@@ -181,19 +194,19 @@ class LinkedInScraper(Scraper):
             "div", class_=lambda x: x and "show-more-less-html__markup" in x
         )

-        text_content = None
+        description = None
         if div_content:
-            text_content = " ".join(div_content.get_text().split()).strip()
+            description = " ".join(div_content.get_text().split()).strip()

         def get_job_type(
-            soup: BeautifulSoup,
-        ) -> Tuple[Optional[str], Optional[JobType]]:
+            soup_job_type: BeautifulSoup,
+        ) -> JobType | None:
             """
             Gets the job type from job page
-            :param soup:
+            :param soup_job_type:
             :return: JobType
             """
-            h3_tag = soup.find(
+            h3_tag = soup_job_type.find(
                 "h3",
                 class_="description__job-criteria-subheader",
                 string=lambda text: "Employment type" in text,
@@ -212,7 +225,7 @@ class LinkedInScraper(Scraper):
             return LinkedInScraper.get_enum_from_value(employment_type)

-        return text_content, get_job_type(soup)
+        return description, get_job_type(soup)

     @staticmethod
     def get_enum_from_value(value_str):
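The body of `get_enum_from_value` falls outside the hunks shown; presumably it maps the scraped "Employment type" string onto a `JobType` member. An illustrative, self-contained sketch of that lookup (the real `JobType` enum carries more members and synonyms):

```python
# Illustrative sketch only; the committed get_enum_from_value body is not
# part of this diff, and jobspy's JobType enum is larger than shown here.
from enum import Enum
from typing import Optional

class JobType(Enum):
    FULL_TIME = ("fulltime", "full-time")
    PART_TIME = ("parttime", "part-time")
    CONTRACT = ("contract",)

def get_enum_from_value(value_str: str) -> Optional[JobType]:
    normalized = value_str.lower().strip()
    for job_type in JobType:
        if normalized in job_type.value:
            return job_type
    return None

assert get_enum_from_value("Full-time") is JobType.FULL_TIME
```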