mirror of https://github.com/Bunsly/JobSpy
FEAT: Allow LinkedIn scraper to get external job apply url
parent 5d3df732e6
commit 878da53b06
@@ -9,6 +9,8 @@ from __future__ import annotations
 import time
 import random
+import regex as re
+import urllib.parse
 from typing import Optional
 from datetime import datetime
@@ -51,6 +53,7 @@ class LinkedInScraper(Scraper):
         super().__init__(Site(Site.LINKEDIN), proxy=proxy)
         self.scraper_input = None
         self.country = "worldwide"
+        self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -203,7 +206,7 @@ class LinkedInScraper(Scraper):
             date_posted = None
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         if full_descr:
-            description, job_type = self._get_job_description(job_url)
+            description, job_type, job_url_direct = self._get_job_description(job_url)

         return JobPost(
             title=title,
@@ -212,6 +215,7 @@ class LinkedInScraper(Scraper):
             location=location,
             date_posted=date_posted,
             job_url=job_url,
+            job_url_direct=job_url_direct,
             compensation=compensation,
             job_type=job_type,
             description=description,
@@ -220,7 +224,9 @@ class LinkedInScraper(Scraper):

     def _get_job_description(
         self, job_page_url: str
-    ) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
+    ) -> tuple[None, None, None] | tuple[
+        str | None, tuple[str | None, JobType | None], str | None
+    ]:
         """
         Retrieves job description by going to the job page url
         :param job_page_url:
@@ -253,7 +259,7 @@ class LinkedInScraper(Scraper):
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                 description = markdown_converter(description)
-        return description, self._parse_job_type(soup)
+        return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)

     def _get_location(self, metadata_card: Optional[Tag]) -> Location:
         """
@@ -306,6 +312,23 @@ class LinkedInScraper(Scraper):

         return [get_enum_from_job_type(employment_type)] if employment_type else []

+    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
+        """
+        Gets the job url direct from job page
+        :param soup:
+        :return: str
+        """
+        job_url_direct = None
+        job_url_direct_content = soup.find("code", id="applyUrl")
+        if job_url_direct_content:
+            job_url_direct_match = self.job_url_direct_regex.search(
+                job_url_direct_content.decode_contents().strip()
+            )
+            if job_url_direct_match:
+                job_url_direct = urllib.parse.unquote(job_url_direct_match.group())
+
+        return job_url_direct
+
     @staticmethod
     def job_type_code(job_type_enum: JobType) -> str:
         return {
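
A minimal, self-contained sketch (not part of this commit) of the extraction the new _parse_job_url_direct helper performs, assuming the LinkedIn job page embeds the external apply link in a code element with id "applyUrl" as a percent-encoded ?url= query parameter. The sample markup is hypothetical, and the stdlib re module stands in for the regex package used by the scraper.

import re
import urllib.parse
from bs4 import BeautifulSoup

# Hypothetical fragment of a LinkedIn job page; the real markup may differ.
html = '<code id="applyUrl">"https://www.linkedin.com/job-apply/redirect?url=https%3A%2F%2Fexample.com%2Fcareers%2F12345"</code>'

soup = BeautifulSoup(html, "html.parser")
job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')  # same pattern the scraper compiles in __init__

job_url_direct = None
tag = soup.find("code", id="applyUrl")
if tag:
    match = job_url_direct_regex.search(tag.decode_contents().strip())
    if match:
        # Decode %3A, %2F, ... back into a plain URL
        job_url_direct = urllib.parse.unquote(match.group())

print(job_url_direct)  # https://example.com/careers/12345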
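
A usage sketch of how the new field might surface through JobSpy's top-level scrape_jobs helper. The linkedin_fetch_description flag name and the DataFrame column layout are assumptions here; the direct apply URL is only resolved when the full job page is fetched (the full_descr branch above).

from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["linkedin"],
    search_term="software engineer",
    location="San Francisco, CA",
    results_wanted=5,
    linkedin_fetch_description=True,  # assumed flag: fetch each job page so the apply URL can be parsed
)

# jobs is a pandas DataFrame; the column name mirrors the JobPost field added in this commit
print(jobs[["job_url", "job_url_direct"]].head())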