FEAT: Allow LinkedIn scraper to get external job apply url

pull/140/head
Lluis 2024-04-30 08:52:18 +02:00
parent 5d3df732e6
commit 878da53b06
1 changed files with 26 additions and 3 deletions

View File

@ -9,6 +9,8 @@ from __future__ import annotations
import time
import random
import regex as re
import urllib.parse
from typing import Optional
from datetime import datetime
@ -51,6 +53,7 @@ class LinkedInScraper(Scraper):
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -203,7 +206,7 @@ class LinkedInScraper(Scraper):
date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text")
if full_descr:
description, job_type = self._get_job_description(job_url)
description, job_type, job_url_direct = self._get_job_description(job_url)
return JobPost(
title=title,
@ -212,6 +215,7 @@ class LinkedInScraper(Scraper):
location=location,
date_posted=date_posted,
job_url=job_url,
job_url_direct=job_url_direct,
compensation=compensation,
job_type=job_type,
description=description,
@ -220,7 +224,9 @@ class LinkedInScraper(Scraper):
def _get_job_description(
self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
) -> tuple[None, None, None] | tuple[
str | None, tuple[str | None, JobType | None], str | None
]:
"""
Retrieves job description by going to the job page url
:param job_page_url:
@ -253,7 +259,7 @@ class LinkedInScraper(Scraper):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup)
return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
@ -306,6 +312,23 @@ class LinkedInScraper(Scraper):
return [get_enum_from_job_type(employment_type)] if employment_type else []
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
:param soup:
:return: str
"""
job_url_direct = None
job_url_direct_content = soup.find("code", id="applyUrl")
if job_url_direct_content:
job_url_direct_match = self.job_url_direct_regex.search(
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
job_url_direct = urllib.parse.unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {