mirror of https://github.com/Bunsly/JobSpy
format: jobspy/scrapers/linkedin
parent 0654232db4
commit 2e421ff4e2
@@ -4,6 +4,7 @@ jobspy.scrapers.linkedin
 This module contains routines to scrape LinkedIn.
 """
 
 from __future__ import annotations
 
 import time
@@ -26,14 +27,14 @@ from ...jobs import (
     JobType,
     Country,
     Compensation,
-    DescriptionFormat
+    DescriptionFormat,
 )
 from ..utils import (
     logger,
     extract_emails_from_text,
     get_enum_from_job_type,
     currency_parser,
-    markdown_converter
+    markdown_converter,
 )
 
 
@@ -63,26 +64,32 @@ class LinkedInScraper(Scraper):
         url_lock = Lock()
         page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
         seconds_old = (
-            scraper_input.hours_old * 3600
-            if scraper_input.hours_old
-            else None
+            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
         )
-        continue_search = lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+        continue_search = (
+            lambda: len(job_list) < scraper_input.results_wanted and page < 1000
+        )
         while continue_search():
-            logger.info(f'LinkedIn search page: {page // 25 + 1}')
+            logger.info(f"LinkedIn search page: {page // 25 + 1}")
             session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
                 "location": scraper_input.location,
                 "distance": scraper_input.distance,
                 "f_WT": 2 if scraper_input.is_remote else None,
-                "f_JT": self.job_type_code(scraper_input.job_type)
-                if scraper_input.job_type
-                else None,
+                "f_JT": (
+                    self.job_type_code(scraper_input.job_type)
+                    if scraper_input.job_type
+                    else None
+                ),
                 "pageNum": 0,
                 "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
-                "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
+                "f_C": (
+                    ",".join(map(str, scraper_input.linkedin_company_ids))
+                    if scraper_input.linkedin_company_ids
+                    else None
+                ),
             }
             if seconds_old is not None:
                 params["f_TPR"] = f"r{seconds_old}"
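The reformatted `continue_search` predicate and the `start`/`f_TPR` params above carry the whole pagination scheme: LinkedIn's guest search serves 25 cards per page, `start` is the card offset, and the loop never asks past offset 1000. A self-contained sketch of that loop shape (the `fetched` payload is a stand-in for the real HTTP request, and `results_wanted` a stand-in for `scraper_input.results_wanted`):

```python
results_wanted = 40  # stand-in for scraper_input.results_wanted
job_list: list[dict] = []
page = 0

continue_search = (
    lambda: len(job_list) < results_wanted and page < 1000
)

while continue_search():
    # params["start"] = page + offset in the real scraper; one page = 25 cards
    fetched = [{"start": page, "card": i} for i in range(25)]  # fake results page
    job_list.extend(fetched)
    page += len(fetched)

job_list = job_list[:results_wanted]  # the scraper trims the overshoot the same way
print(len(job_list))  # 40
```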
@@ -99,15 +106,19 @@
                 )
                 if response.status_code not in range(200, 400):
                     if response.status_code == 429:
-                        logger.error(f'429 Response - Blocked by LinkedIn for too many requests')
+                        err = (
+                            f"429 Response - Blocked by LinkedIn for too many requests"
+                        )
                     else:
-                        logger.error(f'LinkedIn response status code {response.status_code}')
+                        err = f"LinkedIn response status code {response.status_code}"
+                    err += f" - {response.text}"
+                    logger.error(err)
                     return JobResponse(jobs=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
-                    logger.error(f'LinkedIn: Bad proxy')
+                    logger.error(f"LinkedIn: Bad proxy")
                 else:
-                    logger.error(f'LinkedIn: {str(e)}')
+                    logger.error(f"LinkedIn: {str(e)}")
                 return JobResponse(jobs=job_list)
 
             soup = BeautifulSoup(response.text, "html.parser")
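`create_session(is_tls=False, has_retry=True, delay=5)` comes from the shared `..utils` module, whose internals are outside this diff. As a rough sketch of what such a helper typically looks like (the retry policy below is an assumption, not JobSpy's actual implementation), a `requests` session with urllib3 retries:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session(
    is_tls: bool = False, has_retry: bool = False, delay: int = 1
) -> requests.Session:
    """Hypothetical stand-in for jobspy's ..utils.create_session (is_tls unused here)."""
    session = requests.Session()
    if has_retry:
        retries = Retry(
            total=3,
            backoff_factor=delay,  # exponential backoff scaled by the delay argument
            status_forcelist=[429, 500, 502, 503, 504],
        )
        for scheme in ("http://", "https://"):
            session.mount(scheme, HTTPAdapter(max_retries=retries))
    return session
```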
@@ -128,7 +139,8 @@
                         continue
                     seen_urls.add(job_url)
                 try:
-                    job_post = self._process_job(job_card, job_url, scraper_input.linkedin_fetch_description)
+                    fetch_desc = scraper_input.linkedin_fetch_description
+                    job_post = self._process_job(job_card, job_url, fetch_desc)
                     if job_post:
                         job_list.append(job_post)
                     if not continue_search():
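`seen_urls` guards against LinkedIn serving the same posting on consecutive pages. The card selectors live outside this hunk; a runnable sketch of the dedup pattern, with invented markup and assumed class names:

```python
from bs4 import BeautifulSoup

html = """
<div class="base-search-card"><a class="base-card__full-link"
   href="https://www.linkedin.com/jobs/view/123?refId=a">Engineer</a></div>
<div class="base-search-card"><a class="base-card__full-link"
   href="https://www.linkedin.com/jobs/view/123?refId=b">Engineer (dup)</a></div>
"""

soup = BeautifulSoup(html, "html.parser")
seen_urls: set[str] = set()
for job_card in soup.find_all("div", class_="base-search-card"):
    href = job_card.find("a", class_="base-card__full-link")["href"]
    job_url = href.split("?")[0]  # strip tracking params before comparing (assumed)
    if job_url in seen_urls:
        continue
    seen_urls.add(job_url)
    print(job_url)  # printed once despite two cards
```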
@@ -143,8 +155,10 @@
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
 
-    def _process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
-        salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+    def _process_job(
+        self, job_card: Tag, job_url: str, full_descr: bool
+    ) -> Optional[JobPost]:
+        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
 
         compensation = None
         if salary_tag:
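The salary span holds free text such as `$80,000.00 - $100,000.00`. The real parsing goes through `currency_parser` from `..utils`; a self-contained approximation of splitting such a range into floats (the regex and range format are assumptions):

```python
import re

def parse_salary_range(text: str) -> tuple[float, float] | None:
    """Rough stand-in for the salary handling built on currency_parser."""
    # Grab every number, ignoring currency symbols and thousands separators.
    amounts = [float(m.replace(",", "")) for m in re.findall(r"[\d,]+(?:\.\d+)?", text)]
    if len(amounts) >= 2:
        return amounts[0], amounts[1]
    return None

print(parse_salary_range("$80,000.00 - $100,000.00"))  # (80000.0, 100000.0)
```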
@@ -214,7 +228,9 @@
         """
         try:
             session = create_session(is_tls=False, has_retry=True)
-            response = session.get(job_page_url, headers=self.headers, timeout=5, proxies=self.proxy)
+            response = session.get(
+                job_page_url, headers=self.headers, timeout=5, proxies=self.proxy
+            )
             response.raise_for_status()
         except:
             return None, None
@@ -227,10 +243,12 @@
         )
         description = None
         if div_content is not None:
+
             def remove_attributes(tag):
                 for attr in list(tag.attrs):
                     del tag[attr]
                 return tag
+
             div_content = remove_attributes(div_content)
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
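`remove_attributes` clears LinkedIn's class and tracking attributes from the description's root `div` before prettifying, so the HTML (and any markdown conversion after it) stays clean. Note it only touches the tag it is given, not its children. A runnable demonstration (the sample markup is invented):

```python
from bs4 import BeautifulSoup

def remove_attributes(tag):
    for attr in list(tag.attrs):
        del tag[attr]
    return tag

soup = BeautifulSoup(
    '<div class="show-more-less-html" data-x="1"><p dir="ltr">Senior role</p></div>',
    "html.parser",
)
print(remove_attributes(soup.div).prettify(formatter="html"))
# <div>
#  <p dir="ltr">      <- children keep their attributes; only the root is cleared
#   Senior role
#  </p>
# </div>
```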
@@ -259,11 +277,8 @@
             )
         elif len(parts) == 3:
             city, state, country = parts
-            location = Location(
-                city=city,
-                state=state,
-                country=Country.from_string(country)
-            )
+            country = Country.from_string(country)
+            location = Location(city=city, state=state, country=country)
         return location
 
     @staticmethod
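This is the tail of the location parser: LinkedIn's guest cards render locations as comma-separated text, and the two- and three-part cases map onto city/state and city/state/country. A self-contained sketch of the same split (the `Location` dataclass below is a simplified stand-in for jobspy's model, and `Country.from_string` is replaced by a plain string):

```python
from dataclasses import dataclass

@dataclass
class Location:  # simplified stand-in for jobspy.jobs.Location
    city: str | None = None
    state: str | None = None
    country: str | None = None

def parse_location(location_text: str) -> Location:
    parts = [p.strip() for p in location_text.split(",")]
    if len(parts) == 2:
        return Location(city=parts[0], state=parts[1])
    elif len(parts) == 3:
        city, state, country = parts
        return Location(city=city, state=state, country=country)
    return Location()

print(parse_location("Dallas, TX"))                 # city/state only
print(parse_location("Dallas, TX, United States"))  # the real code maps the last part via Country.from_string
```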