mirror of https://github.com/Bunsly/JobSpy
FEAT: Allow LinkedIn scraper to get external job apply url
parent
5d3df732e6
commit
878da53b06
|
@ -9,6 +9,8 @@ from __future__ import annotations
|
|||
|
||||
import time
|
||||
import random
|
||||
import regex as re
|
||||
import urllib.parse
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
|
@ -51,6 +53,7 @@ class LinkedInScraper(Scraper):
|
|||
super().__init__(Site(Site.LINKEDIN), proxy=proxy)
|
||||
self.scraper_input = None
|
||||
self.country = "worldwide"
|
||||
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
|
@ -203,7 +206,7 @@ class LinkedInScraper(Scraper):
|
|||
date_posted = None
|
||||
benefits_tag = job_card.find("span", class_="result-benefits__text")
|
||||
if full_descr:
|
||||
description, job_type = self._get_job_description(job_url)
|
||||
description, job_type, job_url_direct = self._get_job_description(job_url)
|
||||
|
||||
return JobPost(
|
||||
title=title,
|
||||
|
@ -212,6 +215,7 @@ class LinkedInScraper(Scraper):
|
|||
location=location,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
job_url_direct=job_url_direct,
|
||||
compensation=compensation,
|
||||
job_type=job_type,
|
||||
description=description,
|
||||
|
@ -220,7 +224,9 @@ class LinkedInScraper(Scraper):
|
|||
|
||||
def _get_job_description(
|
||||
self, job_page_url: str
|
||||
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
|
||||
) -> tuple[None, None, None] | tuple[
|
||||
str | None, tuple[str | None, JobType | None], str | None
|
||||
]:
|
||||
"""
|
||||
Retrieves job description by going to the job page url
|
||||
:param job_page_url:
|
||||
|
@ -253,7 +259,7 @@ class LinkedInScraper(Scraper):
|
|||
description = div_content.prettify(formatter="html")
|
||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
description = markdown_converter(description)
|
||||
return description, self._parse_job_type(soup)
|
||||
return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
|
||||
|
||||
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||
"""
|
||||
|
@ -306,6 +312,23 @@ class LinkedInScraper(Scraper):
|
|||
|
||||
return [get_enum_from_job_type(employment_type)] if employment_type else []
|
||||
|
||||
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
    """
    Extracts the direct (external) apply url from a job page

    Looks up the ``<code id="applyUrl">`` element, searches its contents with
    ``self.job_url_direct_regex`` (expected to capture the text that follows
    ``?url=``), and percent-decodes the captured value.
    :param soup: parsed job page
    :return: the decoded url, or None when the element is missing or the
        pattern does not match its contents
    """
    apply_url_tag = soup.find("code", id="applyUrl")
    if not apply_url_tag:
        return None

    url_match = self.job_url_direct_regex.search(
        apply_url_tag.decode_contents().strip()
    )
    if not url_match:
        return None

    # The pattern runs up to the closing quote, so it also swallows any query
    # parameters that follow ``url`` in the wrapper link ("&name=value").
    # The target is percent-encoded, so the first literal "&" marks the end
    # of the ``url`` value; cut there before decoding.
    encoded_url = url_match.group().partition("&")[0]
    return urllib.parse.unquote(encoded_url)
|
||||
|
||||
@staticmethod
|
||||
def job_type_code(job_type_enum: JobType) -> str:
|
||||
return {
|
||||
|
|
Loading…
Reference in New Issue