2023-09-07 09:28:17 -07:00
|
|
|
"""
|
|
|
|
jobspy.scrapers.linkedin
|
|
|
|
~~~~~~~~~~~~~~~~~~~
|
|
|
|
|
|
|
|
This module contains routines to scrape LinkedIn.
|
|
|
|
"""
|
2023-09-28 16:11:28 -07:00
|
|
|
from typing import Optional
|
2023-08-31 08:29:43 -07:00
|
|
|
from datetime import datetime
|
2023-07-08 07:34:55 -07:00
|
|
|
|
|
|
|
import requests
|
2023-09-28 16:11:28 -07:00
|
|
|
import time
|
|
|
|
from requests.exceptions import ProxyError
|
2023-07-08 07:34:55 -07:00
|
|
|
from bs4 import BeautifulSoup
|
2023-07-10 20:07:19 -07:00
|
|
|
from bs4.element import Tag
|
2023-09-28 16:11:28 -07:00
|
|
|
from threading import Lock
|
2023-11-08 13:51:07 -08:00
|
|
|
from urllib.parse import urlparse, urlunparse
|
2023-07-08 07:34:55 -07:00
|
|
|
|
2023-09-03 07:29:25 -07:00
|
|
|
from .. import Scraper, ScraperInput, Site
|
2023-11-09 12:57:15 -08:00
|
|
|
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
2023-09-07 09:28:17 -07:00
|
|
|
from ..exceptions import LinkedInException
|
2023-11-09 12:57:15 -08:00
|
|
|
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
2023-09-28 16:33:14 -07:00
|
|
|
|
|
|
|
|
2023-07-08 07:34:55 -07:00
|
|
|
class LinkedInScraper(Scraper):
    """
    Scraper for LinkedIn's guest job search.

    Result pages are requested from the ``seeMoreJobPostings`` endpoint and
    every job card found is enriched with data read from the job's own page.
    """

    # Maximum number of attempts for one search-page request.
    MAX_RETRIES = 3
    # Seconds to sleep after a 429/502 response.
    DELAY = 10

    def __init__(self, proxy: Optional[str] = None):
        """
        Initializes LinkedInScraper with the LinkedIn job search url

        :param proxy: proxy forwarded unchanged to the base Scraper
        """
        site = Site(Site.LINKEDIN)
        # Country used for locations that do not name one themselves.
        self.country = "worldwide"
        self.url = "https://www.linkedin.com"
        super().__init__(site, proxy=proxy)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes LinkedIn for jobs with scraper_input criteria

        Pages are requested in steps of 25 results, beginning at
        ``scraper_input.offset``, until ``results_wanted`` jobs have been
        collected, a page comes back without job cards, or the paging limit
        is reached.

        :param scraper_input: search criteria (term, location, filters, ...)
        :return: job_response holding at most ``results_wanted`` jobs
        :raises LinkedInException: when a search request fails or a job card
            cannot be processed
        """
        job_list: list[JobPost] = []
        seen_urls: set[str] = set()
        offset = scraper_input.offset or 0
        # ``page`` counts results skipped *in addition to* the offset; it has
        # to start at 0, otherwise the offset would be counted twice in
        # the ``start`` parameter below.
        page = 0

        # An unmapped job type produces no filter instead of an empty
        # ``f_JT`` value.
        job_type_filter = (
            self._job_type_code(scraper_input.job_type) or None
            if scraper_input.job_type
            else None
        )

        while len(job_list) < scraper_input.results_wanted and page < 1000:
            params = {
                "keywords": scraper_input.search_term,
                "location": scraper_input.location,
                "distance": scraper_input.distance,
                "f_WT": 2 if scraper_input.is_remote else None,
                "f_JT": job_type_filter,
                "pageNum": 0,
                "start": page + offset,
                "f_AL": "true" if scraper_input.easy_apply else None,
            }
            # Unset criteria are left out of the query string entirely.
            params = {k: v for k, v in params.items() if v is not None}

            response = self._fetch_search_page(params)
            soup = BeautifulSoup(response.text, "html.parser")

            job_cards = soup.find_all("div", class_="base-search-card")
            if not job_cards:
                # No cards on this page: stop instead of requesting every
                # remaining page up to the paging limit.
                break

            for job_card in job_cards:
                if len(job_list) >= scraper_input.results_wanted:
                    # Each processed card costs one extra HTTP request, so
                    # stop as soon as enough jobs have been gathered.
                    break

                job_url = self._job_url_from_card(job_card)
                # Cards without a link cannot be fetched or de-duplicated.
                if job_url is None or job_url in seen_urls:
                    continue
                seen_urls.add(job_url)

                try:
                    job_post = self.process_job(job_card, job_url)
                except Exception as e:
                    raise LinkedInException(
                        "Exception occurred while processing jobs"
                    ) from e
                if job_post:
                    job_list.append(job_post)

            page += 25

        return JobResponse(jobs=job_list[: scraper_input.results_wanted])

    @staticmethod
    def _job_type_code(job_type_enum) -> str:
        """Return the ``f_JT`` code for a JobType, or "" when unmapped."""
        mapping = {
            JobType.FULL_TIME: "F",
            JobType.PART_TIME: "P",
            JobType.INTERNSHIP: "I",
            JobType.CONTRACT: "C",
            JobType.TEMPORARY: "T",
        }
        return mapping.get(job_type_enum, "")

    def _fetch_search_page(self, params: dict) -> requests.Response:
        """
        Request one page of search results, retrying on 429/502 responses.

        :param params: query parameters of the search request
        :return: the successful response
        :raises LinkedInException: on a bad proxy, any other non-success
            status code, any other request error, or when MAX_RETRIES
            attempts were all answered with 429/502
        """
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(
                    f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                    params=params,
                    allow_redirects=True,
                    proxies=self.proxy,
                    timeout=10,
                )
                response.raise_for_status()
                return response
            except requests.HTTPError as e:
                failed = getattr(e, "response", None)
                if failed is None:
                    raise
                if failed.status_code in (429, 502):
                    # Wait before the next attempt.
                    time.sleep(self.DELAY)
                    continue
                raise LinkedInException(
                    f"bad response status code: {failed.status_code}"
                ) from e
            except ProxyError as e:
                raise LinkedInException("bad proxy") from e
            except Exception as e:
                raise LinkedInException(str(e)) from e

        # Every attempt ended in a retryable status code.
        raise LinkedInException(
            "Max retries reached, failed to get a valid response"
        )

    def _job_url_from_card(self, job_card: Tag) -> Optional[str]:
        """Build the job view url from a card's link, or None without one."""
        href_tag = job_card.find("a", class_="base-card__full-link")
        if not href_tag or "href" not in href_tag.attrs:
            return None
        # Drop the query string; the job id is the text after the last "-".
        href = href_tag.attrs["href"].split("?")[0]
        job_id = href.split("-")[-1]
        return f"{self.url}/jobs/view/{job_id}"

    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
        """
        Builds a JobPost from a search result card and the job's own page

        :param job_card: the ``base-search-card`` element of the job
        :param job_url: url of the job page, fetched for the description
        :return: the assembled job post
        """
        compensation = self._parse_compensation(
            job_card.find("span", class_="job-search-card__salary-info")
        )

        title_tag = job_card.find("span", class_="sr-only")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
        company_a_tag = company_tag.find("a") if company_tag else None
        company_url = (
            # Keep the company link but strip its query string.
            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
            if company_a_tag and company_a_tag.has_attr("href")
            else ""
        )
        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

        metadata_card = job_card.find("div", class_="base-search-card__metadata")
        location = self.get_location(metadata_card)

        datetime_tag = (
            metadata_card.find("time", class_="job-search-card__listdate")
            if metadata_card
            else None
        )
        date_posted = None
        if datetime_tag and "datetime" in datetime_tag.attrs:
            try:
                date_posted = datetime.strptime(datetime_tag["datetime"], "%Y-%m-%d")
            except (ValueError, TypeError):
                # An unparsable date is treated as "not available".
                date_posted = None

        benefits_tag = job_card.find("span", class_="result-benefits__text")
        # Collapse all runs of whitespace into single spaces.
        benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None

        description, job_type = self.get_job_description(job_url)

        return JobPost(
            title=title,
            description=description,
            company_name=company,
            company_url=company_url,
            location=location,
            date_posted=date_posted,
            job_url=job_url,
            job_type=job_type,
            compensation=compensation,
            benefits=benefits,
            emails=extract_emails_from_text(description) if description else None,
            num_urgent_words=count_urgent_words(description) if description else None,
        )

    @staticmethod
    def _parse_compensation(salary_tag: Optional[Tag]) -> Optional[Compensation]:
        """Turn a card's salary element ("min - max") into a Compensation."""
        if salary_tag is None:
            return None
        salary_text = salary_tag.get_text(separator=" ").strip()
        if not salary_text:
            return None

        amounts = [
            currency_parser(part) for part in salary_text.split("-") if part.strip()
        ]
        if not amounts:
            return None
        salary_min = amounts[0]
        # A single figure (no "-" range) is used as both bounds instead of
        # failing with an IndexError.
        salary_max = amounts[1] if len(amounts) > 1 else salary_min
        # The leading character is taken as the currency; "$" becomes "USD".
        currency = salary_text[0] if salary_text[0] != "$" else "USD"

        return Compensation(
            min_amount=int(salary_min),
            max_amount=int(salary_max),
            currency=currency,
        )

    def get_job_description(
        self, job_page_url: str
    ) -> tuple[str | None, list[JobType] | None]:
        """
        Retrieves job description by going to the job page url

        :param job_page_url: url of the job page
        :return: ``(description, job_types)``; ``(None, None)`` when the page
            cannot be fetched or the request ends up on the signup page
        """
        try:
            response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
            response.raise_for_status()
        except requests.HTTPError as e:
            failed = getattr(e, "response", None)
            if failed is not None and failed.status_code in (429, 502):
                # Pause before the caller moves on to the next request.
                time.sleep(self.DELAY)
            return None, None
        except Exception:
            # Best effort: a job without a description is still returned.
            return None, None
        if response.url == "https://www.linkedin.com/signup":
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )

        description = None
        if div_content:
            # Collapse all runs of whitespace into single spaces.
            description = " ".join(div_content.get_text().split()).strip()

        return description, self._get_job_type(soup)

    @staticmethod
    def _get_job_type(soup_job_type: BeautifulSoup) -> list[JobType]:
        """
        Gets the job type from job page

        :param soup_job_type: parsed job page
        :return: list with the matching JobType, empty when none is found
        """
        h3_tag = soup_job_type.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The filter can be handed None, so guard before testing
            # membership.
            string=lambda text: bool(text) and "Employment type" in text,
        )
        if not h3_tag:
            return []

        employment_type_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if not employment_type_span:
            return []

        # Normalize the label: lower-cased, hyphens removed.
        employment_type = employment_type_span.get_text(strip=True)
        employment_type = employment_type.lower().replace("-", "")
        if not employment_type:
            return []

        job_type = get_enum_from_job_type(employment_type)
        # Defensive: never hand back [None] should the helper find no match.
        return [job_type] if job_type is not None else []

    def get_location(self, metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.

        "City, State" and "City, State, Country" strings are recognized; any
        other shape results in a location carrying only the default country.

        :param metadata_card: metadata element of the job card, may be None
        :return: location
        """
        default_country = Country.from_string(self.country)
        fallback = Location(country=default_country)
        if metadata_card is None:
            return fallback

        location_tag = metadata_card.find("span", class_="job-search-card__location")
        if location_tag is None:
            return fallback

        parts = location_tag.text.strip().split(", ")
        if len(parts) == 2:
            city, state = parts
            return Location(city=city, state=state, country=default_country)
        if len(parts) == 3:
            city, state, country = parts
            return Location(
                city=city,
                state=state,
                country=Country.from_string(country),
            )
        return fallback
|