FEAT: Allow LinkedIn scraper to get external job apply url

pull/140/head
Lluis 2024-04-30 08:52:18 +02:00
parent 5d3df732e6
commit 878da53b06
1 changed file with 26 additions and 3 deletions

View File

@ -9,6 +9,8 @@ from __future__ import annotations
import time import time
import random import random
import regex as re
import urllib.parse
from typing import Optional from typing import Optional
from datetime import datetime from datetime import datetime
@ -51,6 +53,7 @@ class LinkedInScraper(Scraper):
super().__init__(Site(Site.LINKEDIN), proxy=proxy) super().__init__(Site(Site.LINKEDIN), proxy=proxy)
self.scraper_input = None self.scraper_input = None
self.country = "worldwide" self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
def scrape(self, scraper_input: ScraperInput) -> JobResponse: def scrape(self, scraper_input: ScraperInput) -> JobResponse:
""" """
@ -203,7 +206,7 @@ class LinkedInScraper(Scraper):
date_posted = None date_posted = None
benefits_tag = job_card.find("span", class_="result-benefits__text") benefits_tag = job_card.find("span", class_="result-benefits__text")
if full_descr: if full_descr:
description, job_type = self._get_job_description(job_url) description, job_type, job_url_direct = self._get_job_description(job_url)
return JobPost( return JobPost(
title=title, title=title,
@ -212,6 +215,7 @@ class LinkedInScraper(Scraper):
location=location, location=location,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_url_direct=job_url_direct,
compensation=compensation, compensation=compensation,
job_type=job_type, job_type=job_type,
description=description, description=description,
@ -220,7 +224,9 @@ class LinkedInScraper(Scraper):
def _get_job_description( def _get_job_description(
self, job_page_url: str self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]: ) -> tuple[None, None, None] | tuple[
str | None, tuple[str | None, JobType | None], str | None
]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
@ -253,7 +259,7 @@ class LinkedInScraper(Scraper):
description = div_content.prettify(formatter="html") description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description) description = markdown_converter(description)
return description, self._parse_job_type(soup) return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
def _get_location(self, metadata_card: Optional[Tag]) -> Location: def _get_location(self, metadata_card: Optional[Tag]) -> Location:
""" """
@ -306,6 +312,23 @@ class LinkedInScraper(Scraper):
return [get_enum_from_job_type(employment_type)] if employment_type else [] return [get_enum_from_job_type(employment_type)] if employment_type else []
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
    """
    Gets the direct (external) apply url from a parsed job page
    :param soup: parsed job page, searched for a <code id="applyUrl"> element
    :return: the text matched by self.job_url_direct_regex inside that
        element with percent-escapes decoded, or None when the element is
        absent or the pattern does not match
    """
    apply_url_tag = soup.find("code", id="applyUrl")
    if not apply_url_tag:
        return None

    # The pattern is run against the element's inner markup (stripped of
    # surrounding whitespace), not against its rendered text.
    match = self.job_url_direct_regex.search(
        apply_url_tag.decode_contents().strip()
    )
    if not match:
        return None

    # Decode any percent-escapes (%3A, %2F, ...) in the matched text.
    # NOTE(review): the matched text is returned whole, so whatever the
    # pattern captures after the target url is kept as well - confirm
    # against real page markup that this is intended.
    return urllib.parse.unquote(match.group())
@staticmethod @staticmethod
def job_type_code(job_type_enum: JobType) -> str: def job_type_code(job_type_enum: JobType) -> str:
return { return {