# JobSpy/src/jobspy/scrapers/linkedin/__init__.py
# (307 lines, 11 KiB, Python)

"""
jobspy.scrapers.linkedin
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape LinkedIn.
"""
2024-01-28 19:50:41 -08:00
import time
2024-01-09 17:32:51 -08:00
import random
from typing import Optional
2023-08-31 08:29:43 -07:00
from datetime import datetime
2023-07-08 07:34:55 -07:00
import requests
from requests.exceptions import ProxyError
from threading import Lock
2024-01-28 19:50:41 -08:00
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
2023-07-08 07:34:55 -07:00
2023-09-03 07:29:25 -07:00
from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException
2024-01-09 17:32:51 -08:00
from ..utils import create_session
2024-01-28 19:50:41 -08:00
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation
)
from ..utils import (
count_urgent_words,
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
modify_and_get_description
)
2023-09-28 16:33:14 -07:00
2023-07-08 07:34:55 -07:00
class LinkedInScraper(Scraper):
    """
    Scraper for LinkedIn job postings.

    Search results are fetched page by page from the
    ``/jobs-guest/jobs/api/seeMoreJobPostings/search`` endpoint and each result
    card is converted into a ``JobPost``. When a full description is requested,
    the individual job page is fetched as well.
    """

    # Lower bound (seconds) of the random pause between result pages; the
    # actual pause is drawn uniformly from [DELAY, DELAY + 2].
    DELAY = 3

    # Amount the "start" query parameter advances per results page.
    _PAGE_STEP = 25

    # Paging stops once the page counter reaches this value.
    _MAX_PAGE = 1000

    # Codes sent in the "f_JT" query parameter for each supported job type.
    _JOB_TYPE_CODES = {
        JobType.FULL_TIME: "F",
        JobType.PART_TIME: "P",
        JobType.INTERNSHIP: "I",
        JobType.CONTRACT: "C",
        JobType.TEMPORARY: "T",
    }

    def __init__(self, proxy: Optional[str] = None):
        """
        Initializes LinkedInScraper with the LinkedIn job search url
        :param proxy: proxy setting forwarded unchanged to the base ``Scraper``
        """
        site = Site(Site.LINKEDIN)
        self.country = "worldwide"
        self.url = "https://www.linkedin.com"
        super().__init__(site, proxy=proxy)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes LinkedIn for jobs with scraper_input criteria
        :param scraper_input: search criteria (term, location, offset, ...)
        :return: job_response holding at most ``results_wanted`` jobs
        :raises LinkedInException: on a failed search request or when a job
            card cannot be processed
        """
        job_list: list[JobPost] = []
        seen_urls: set[str] = set()
        offset = scraper_input.offset or 0
        # The page counter is relative to the requested offset and therefore
        # starts at 0. It used to start at ``offset // 25 + 25`` and was then
        # added to the offset again, which skipped results for any non-zero
        # offset.
        page = 0

        def more_wanted() -> bool:
            return (
                len(job_list) < scraper_input.results_wanted
                and page < self._MAX_PAGE
            )

        while more_wanted():
            session = create_session(is_tls=False, has_retry=True, delay=5)
            params = {
                "keywords": scraper_input.search_term,
                "location": scraper_input.location,
                "distance": scraper_input.distance,
                "f_WT": 2 if scraper_input.is_remote else None,
                "f_JT": self._job_type_code(scraper_input.job_type)
                if scraper_input.job_type
                else None,
                "pageNum": 0,
                "start": page + offset,
                "f_AL": "true" if scraper_input.easy_apply else None,
            }
            # Unset filters are left out of the query string entirely.
            params = {k: v for k, v in params.items() if v is not None}

            try:
                response = session.get(
                    f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                    params=params,
                    allow_redirects=True,
                    proxies=self.proxy,
                    headers=self.headers(),
                    timeout=10,
                )
                response.raise_for_status()
            except requests.HTTPError as e:
                raise LinkedInException(
                    f"bad response status code: {e.response.status_code}"
                ) from e
            except ProxyError as e:
                raise LinkedInException("bad proxy") from e
            except Exception as e:
                raise LinkedInException(str(e)) from e

            soup = BeautifulSoup(response.text, "html.parser")
            job_cards = soup.find_all("div", class_="base-search-card")
            if not job_cards:
                # An empty page means there is nothing further to fetch.
                return JobResponse(jobs=job_list)

            for job_card in job_cards:
                if len(job_list) >= scraper_input.results_wanted:
                    # Enough results; avoid processing (and, for full
                    # descriptions, fetching) cards that would be discarded.
                    break
                job_url = self._extract_job_url(job_card)
                # Cards without a link cannot be identified or de-duplicated.
                if job_url is None or job_url in seen_urls:
                    continue
                seen_urls.add(job_url)
                try:
                    job_post = self.process_job(
                        job_card, job_url, scraper_input.full_description
                    )
                except Exception as e:
                    raise LinkedInException(
                        "Exception occurred while processing jobs"
                    ) from e
                if job_post:
                    job_list.append(job_post)

            page += self._PAGE_STEP
            if more_wanted():
                # Pause only when another page is actually going to be requested.
                time.sleep(
                    random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2)
                )

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)

    @classmethod
    def _job_type_code(cls, job_type_enum) -> Optional[str]:
        """
        Maps a JobType to the code used in the "f_JT" query parameter
        :param job_type_enum: job type requested by the caller
        :return: the code, or None for a job type without a known code (so the
            filter is omitted instead of being sent empty)
        """
        return cls._JOB_TYPE_CODES.get(job_type_enum)

    def _extract_job_url(self, job_card: Tag) -> Optional[str]:
        """
        Builds the canonical job url from a search result card
        :param job_card: one "base-search-card" element
        :return: ``<self.url>/jobs/view/<job id>``, or None when the card has no link
        """
        href_tag = job_card.find("a", class_="base-card__full-link")
        if not href_tag or "href" not in href_tag.attrs:
            return None
        # Drop the query string; the job id is the last "-"-separated token.
        href = href_tag.attrs["href"].split("?")[0]
        job_id = href.split("-")[-1]
        return f"{self.url}/jobs/view/{job_id}"

    @staticmethod
    def _parse_compensation(job_card: Tag) -> Optional[Compensation]:
        """
        Extracts the salary range from a search result card
        :param job_card: one "base-search-card" element
        :return: Compensation, or None when the card has no salary info or the
            salary text is not a "min - max" range
        """
        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
        if not salary_tag:
            return None
        salary_text = salary_tag.get_text(separator=" ").strip()
        bounds = salary_text.split("-")
        if not salary_text or len(bounds) < 2:
            # Previously this case raised IndexError, which aborted the scrape.
            return None
        salary_min = currency_parser(bounds[0])
        salary_max = currency_parser(bounds[1])
        # The first character is taken as the currency symbol; "$" is reported as USD.
        currency = salary_text[0] if salary_text[0] != "$" else "USD"
        return Compensation(
            min_amount=int(salary_min),
            max_amount=int(salary_max),
            currency=currency,
        )

    def process_job(
        self, job_card: Tag, job_url: str, full_descr: bool
    ) -> Optional[JobPost]:
        """
        Converts a search result card into a JobPost
        :param job_card: one "base-search-card" element
        :param job_url: url of the job posting
        :param full_descr: when True, the job page is fetched to fill in the
            description and job type
        :return: job_post
        """
        compensation = self._parse_compensation(job_card)

        title_tag = job_card.find("span", class_="sr-only")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
        company_a_tag = company_tag.find("a") if company_tag else None
        company_url = (
            # Company link with its query string removed.
            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
            if company_a_tag and company_a_tag.has_attr("href")
            else ""
        )
        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

        metadata_card = job_card.find("div", class_="base-search-card__metadata")
        location = self.get_location(metadata_card)
        datetime_tag = (
            metadata_card.find("time", class_="job-search-card__listdate")
            if metadata_card
            else None
        )

        date_posted = description = job_type = None
        if datetime_tag and "datetime" in datetime_tag.attrs:
            datetime_str = datetime_tag["datetime"]
            try:
                date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
            except (ValueError, TypeError):
                # An unparseable date is treated as unknown.
                date_posted = None

        benefits_tag = job_card.find("span", class_="result-benefits__text")
        # Collapse all runs of whitespace into single spaces.
        benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None

        if full_descr:
            description, job_type = self.get_job_description(job_url)

        return JobPost(
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            date_posted=date_posted,
            job_url=job_url,
            compensation=compensation,
            benefits=benefits,
            job_type=job_type,
            description=description,
            emails=extract_emails_from_text(description) if description else None,
            num_urgent_words=count_urgent_words(description) if description else None,
        )

    def get_job_description(
        self, job_page_url: str
    ) -> tuple[str | None, list[JobType] | None]:
        """
        Retrieves job description and job type by going to the job page url
        :param job_page_url:
        :return: (description, job types); (None, None) when the page cannot
            be fetched or the request ends up on the signup page
        """
        try:
            session = create_session(is_tls=False, has_retry=True)
            response = session.get(job_page_url, timeout=5, proxies=self.proxy)
            response.raise_for_status()
        except Exception:
            # Best effort: a job whose page cannot be fetched is still returned
            # by the caller, just without description and job type.
            return None, None
        # Substring match so a signup redirect carrying a query string is
        # recognised as well.
        if "linkedin.com/signup" in response.url:
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )
        description = None
        if div_content:
            description = modify_and_get_description(div_content)

        return description, self._parse_job_type(soup)

    @staticmethod
    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType]:
        """
        Gets the job type from job page
        :param soup_job_type: parsed job page
        :return: list with the matching JobType, or an empty list when the
            page does not state an employment type
        """
        h3_tag = soup_job_type.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The filter may be called with None (a tag without a single
            # string child), so guard before the substring test.
            string=lambda text: bool(text) and "Employment type" in text,
        )
        if not h3_tag:
            return []
        employment_type_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if not employment_type_span:
            return []
        employment_type = employment_type_span.get_text(strip=True)
        employment_type = employment_type.lower().replace("-", "")
        if not employment_type:
            return []
        job_type = get_enum_from_job_type(employment_type)
        # NOTE(review): assumes the helper may return None for an unknown
        # type - confirm; an empty list is returned rather than [None].
        return [job_type] if job_type is not None else []

    def get_location(self, metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card: "base-search-card__metadata" element, or None
        :return: location; only the country (from ``self.country``) is set
            unless the text splits into "city, state" or "city, state, country"
        """
        default_country = Country.from_string(self.country)
        location = Location(country=default_country)
        if metadata_card is None:
            return location

        location_tag = metadata_card.find("span", class_="job-search-card__location")
        location_string = location_tag.text.strip() if location_tag else "N/A"
        parts = location_string.split(", ")
        if len(parts) == 2:
            city, state = parts
            location = Location(city=city, state=state, country=default_country)
        elif len(parts) == 3:
            city, state, country = parts
            location = Location(
                city=city,
                state=state,
                country=Country.from_string(country),
            )
        return location

    @staticmethod
    def headers() -> dict:
        """
        Request headers sent with the job search request
        :return: header name -> value
        """
        return {
            'authority': 'www.linkedin.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            # 'sec-ch-ua-mobile': '?0',
            # 'sec-ch-ua-platform': '"macOS"',
            # 'sec-fetch-dest': 'document',
            # 'sec-fetch-mode': 'navigate',
            # 'sec-fetch-site': 'none',
            # 'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }