2023-08-31 12:01:47 -07:00
|
|
|
from typing import Optional, Tuple
|
2023-08-31 08:29:43 -07:00
|
|
|
from datetime import datetime
|
2023-07-08 07:34:55 -07:00
|
|
|
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
2023-07-10 20:07:19 -07:00
|
|
|
from bs4.element import Tag
|
2023-07-08 07:34:55 -07:00
|
|
|
|
2023-09-03 07:29:25 -07:00
|
|
|
from .. import Scraper, ScraperInput, Site
|
2023-09-03 18:05:31 -07:00
|
|
|
from ...jobs import (
|
|
|
|
JobPost,
|
|
|
|
Location,
|
|
|
|
JobResponse,
|
|
|
|
JobType,
|
|
|
|
Compensation,
|
|
|
|
CompensationInterval,
|
|
|
|
)
|
2023-07-08 07:34:55 -07:00
|
|
|
|
|
|
|
|
|
|
|
class LinkedInScraper(Scraper):
    """
    Scraper for LinkedIn job search result pages.

    Search pages are fetched with ``requests`` and parsed with BeautifulSoup;
    each job card found is turned into a ``JobPost``.
    """

    # Exact ``class`` attribute value used to pick job cards out of a search page.
    _JOB_CARD_CLASS = "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card"

    def __init__(self):
        """
        Initializes LinkedInScraper with the LinkedIn job search url
        """
        site = Site(Site.LINKEDIN)
        url = "https://www.linkedin.com"
        super().__init__(site, url)

    @staticmethod
    def _job_type_code(job_type) -> str:
        """
        Maps a JobType to the single-letter code sent as the "f_JT" query parameter.
        :param job_type:
        :return: the code, or an empty string for a job type without a mapping
        """
        mapping = {
            JobType.FULL_TIME: "F",
            JobType.PART_TIME: "P",
            JobType.INTERNSHIP: "I",
            JobType.CONTRACT: "C",
            JobType.TEMPORARY: "T",
        }
        return mapping.get(job_type, "")

    @staticmethod
    def _parse_job_count(soup: BeautifulSoup) -> Optional[int]:
        """
        Reads the total number of results from the search page header.
        :param soup: parsed search results page
        :return: the count, or None when the header is missing or holds no digits
        """
        count_tag = soup.find("span", class_="results-context-header__job-count")
        if count_tag is None:
            return None
        # The header text may carry separators/suffixes; keep only the digits.
        digits = "".join(filter(str.isdigit, count_tag.text))
        return int(digits) if digits else None

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes LinkedIn for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response; success=False with an error message when a
            search page request does not return HTTP 200
        """
        job_list: list[JobPost] = []
        seen_urls = set()
        page, processed_jobs = 0, 0
        # None until the first page has been parsed, or if the page never
        # exposes a usable count.
        job_count: Optional[int] = None

        def limit_reached() -> bool:
            # Stop once enough jobs are collected or every advertised result
            # has been looked at. An unknown total never stops the scrape.
            return len(job_list) >= scraper_input.results_wanted or (
                job_count is not None and processed_jobs >= job_count
            )

        with requests.Session() as session:
            while len(job_list) < scraper_input.results_wanted:
                params = {
                    "keywords": scraper_input.search_term,
                    "location": scraper_input.location,
                    "distance": scraper_input.distance,
                    "f_WT": 2 if scraper_input.is_remote else None,
                    "f_JT": LinkedInScraper._job_type_code(scraper_input.job_type)
                    if scraper_input.job_type
                    else None,
                    "pageNum": page,
                    "f_AL": "true" if scraper_input.easy_apply else None,
                }
                # Unset filters are dropped instead of being sent as "None".
                params = {k: v for k, v in params.items() if v is not None}

                response = session.get(
                    f"{self.url}/jobs/search", params=params, allow_redirects=True
                )
                if response.status_code != 200:
                    return JobResponse(
                        success=False,
                        error=f"Response returned {response.status_code}",
                    )

                soup = BeautifulSoup(response.text, "html.parser")

                if page == 0:
                    job_count = LinkedInScraper._parse_job_count(soup)

                processed_before = processed_jobs
                for job_card in soup.find_all(
                    "div", class_=LinkedInScraper._JOB_CARD_CLASS
                ):
                    processed_jobs += 1
                    data_entity_urn = job_card.get("data-entity-urn", "")
                    # The job id is the last ":"-separated field of the urn.
                    job_id = (
                        data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
                    )
                    job_url = f"{self.url}/jobs/view/{job_id}"
                    if job_url in seen_urls:
                        continue
                    seen_urls.add(job_url)

                    job_info = job_card.find("div", class_="base-search-card__info")
                    if job_info is None:
                        continue

                    title_tag = job_info.find("h3", class_="base-search-card__title")
                    title = title_tag.text.strip() if title_tag else "N/A"

                    company_tag = job_info.find("a", class_="hidden-nested-link")
                    company = company_tag.text.strip() if company_tag else "N/A"

                    metadata_card = job_info.find(
                        "div", class_="base-search-card__metadata"
                    )
                    location: Location = LinkedInScraper.get_location(metadata_card)

                    # The metadata card is optional; without it there is no
                    # posting date to read.
                    datetime_tag = (
                        metadata_card.find("time", class_="job-search-card__listdate")
                        if metadata_card is not None
                        else None
                    )
                    if datetime_tag:
                        datetime_str = datetime_tag["datetime"]
                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
                    else:
                        date_posted = None

                    # One extra request per job: the description and the
                    # employment type are only present on the job's own page.
                    description, job_type = LinkedInScraper.get_description(job_url)

                    job_post = JobPost(
                        title=title,
                        description=description,
                        company_name=company,
                        location=location,
                        date_posted=date_posted,
                        job_url=job_url,
                        job_type=job_type,
                        compensation=Compensation(
                            interval=CompensationInterval.YEARLY, currency="USD"
                        ),
                    )
                    job_list.append(job_post)
                    if limit_reached():
                        break

                # A page that yielded no cards means further pages cannot make
                # progress either; stop instead of requesting pages forever.
                if limit_reached() or processed_jobs == processed_before:
                    break

                page += 1

        job_list = job_list[: scraper_input.results_wanted]
        job_response = JobResponse(
            success=True,
            jobs=job_list,
            # Fall back to the number of jobs actually collected when the
            # results page did not expose a total.
            total_results=job_count if job_count is not None else len(job_list),
        )
        return job_response

    @staticmethod
    def get_description(
        job_page_url: str,
    ) -> Tuple[Optional[str], Optional[JobType]]:
        """
        Retrieves job description and job type by going to the job page url
        :param job_page_url:
        :return: (description, job_type); each element is None when it cannot
            be found, and both are None when the page request fails
        """
        response = requests.get(job_page_url, allow_redirects=True)
        if response.status_code not in range(200, 400):
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )

        text_content = None
        if div_content:
            # Collapse every run of whitespace into a single space.
            text_content = " ".join(div_content.get_text().split()).strip()

        return text_content, LinkedInScraper._get_job_type(soup)

    @staticmethod
    def _get_job_type(soup: BeautifulSoup) -> Optional[JobType]:
        """
        Gets the job type from job page
        :param soup: parsed job page
        :return: JobType, or None when the page has no employment type or the
            value is not a member of JobType
        """
        h3_tag = soup.find(
            "h3",
            class_="description__job-criteria-subheader",
            # The callable can be handed None for a tag without a single
            # string child, so guard before the substring test.
            string=lambda text: bool(text) and "Employment type" in text,
        )
        if h3_tag is None:
            return None

        employment_type_span = h3_tag.find_next_sibling(
            "span",
            class_="description__job-criteria-text description__job-criteria-text--criteria",
        )
        if employment_type_span is None:
            return None

        employment_type = employment_type_span.get_text(strip=True)
        employment_type = employment_type.lower()
        employment_type = employment_type.replace("-", "")

        try:
            return JobType(employment_type)
        except ValueError:
            # An employment type JobType does not know about must not abort
            # the whole scrape.
            return None

    @staticmethod
    def get_location(metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card
        :return: location; "City, State" text is split into city and state,
            any other text is kept whole in city with no state
        """
        location_string = "N/A"
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            if location_tag is not None:
                location_string = location_tag.text.strip()

        parts = location_string.split(", ")
        if len(parts) == 2:
            city, state = parts
        else:
            # Previously this path left the result unbound and raised
            # UnboundLocalError. Keep the raw text rather than guessing a split.
            # NOTE(review): assumes Location accepts state=None - confirm
            # against the model definition.
            city, state = location_string, None

        return Location(
            city=city,
            state=state,
        )
|