mirror of https://github.com/Bunsly/JobSpy
updated LinkedIn scraper to support multiple locations; on each max-retry failure it continues to the next location
parent
f1c39e47bd
commit
cb625f325f
@@ -14,18 +14,19 @@ async def main():
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",
-        locations=["Rehovot"],
-        # locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
-        results_wanted=5,
+        # locations=["Rehovot"],
+        locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
+                   "Central, Israel", "Rehovot ,Israel"],
+        results_wanted=200,
         hours_old=200,
         country_indeed='israel',
     )
     print(f"Found {len(jobs)} jobs")

-    new_jobs = jobRepository.insertManyIfNotFound(jobs)
+    newJobs = jobRepository.insertManyIfNotFound(jobs)

-    for new_job in new_jobs:
-        await telegramBot.sendJob(new_job)
+    for newJob in newJobs:
+        await telegramBot.sendJob(newJob)

     # Run the async main function
     if __name__ == "__main__":
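For reference, a minimal sketch of what the example script does end to end after this change, assuming JobSpy's usual `scrape_jobs` entry point (the `jobRepository` and `telegramBot` objects from the script are omitted here):

import asyncio
from jobspy import scrape_jobs

async def main():
    # Each entry in `locations` is searched in turn by the LinkedIn scraper.
    jobs = scrape_jobs(
        site_name=["linkedin"],
        search_term="software engineer",
        locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
                   "Central, Israel", "Rehovot ,Israel"],
        results_wanted=200,
        hours_old=200,
    )
    print(f"Found {len(jobs)} jobs")

if __name__ == "__main__":
    asyncio.run(main())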
@@ -17,7 +17,8 @@ from datetime import datetime
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote
+from requests.exceptions import RetryError, RequestException
+from urllib3.exceptions import MaxRetryError
 from .constants import headers
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
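The two new imports matter because a requests session can be configured with urllib3 retries. A minimal sketch of how these exceptions surface, assuming the session mounts an HTTPAdapter with a Retry policy (the retry values here are hypothetical; the scraper's real session setup lives elsewhere in JobSpy):

import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import RetryError
from urllib3.util.retry import Retry

session = requests.Session()
# Hypothetical policy: 3 attempts, exponential backoff, retry on 429/5xx.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
session.mount("https://", HTTPAdapter(max_retries=retry))

try:
    session.get(
        "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search",
        timeout=10,
    )
except RetryError as e:
    # When the Retry budget is spent on bad status codes, requests re-raises
    # urllib3's MaxRetryError as RetryError; the scraper below catches both
    # and moves on to the next location.
    print(f"retries exhausted: {e}")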
@@ -82,87 +83,105 @@ class LinkedInScraper(Scraper):
             scraper_input.hours_old * 3600 if scraper_input.hours_old else None
         )
         continue_search = (
-            lambda: len(job_list) < scraper_input.results_wanted and start < 1000
+            lambda: len(
+                job_list) < scraper_input.results_wanted and start < 1000
         )
-        while continue_search():
-            request_count += 1
-            logger.info(
-                f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
-            )
-            params = {
-                "keywords": scraper_input.search_term,
-                "location": ",".join(scraper_input.locations),
-                "distance": scraper_input.distance,
-                "f_WT": 2 if scraper_input.is_remote else None,
-                "f_JT": (
-                    self.job_type_code(scraper_input.job_type)
-                    if scraper_input.job_type
-                    else None
-                ),
-                "pageNum": 0,
-                "start": start,
-                "f_AL": "true" if scraper_input.easy_apply else None,
-                "f_C": (
-                    ",".join(map(str, scraper_input.linkedin_company_ids))
-                    if scraper_input.linkedin_company_ids
-                    else None
-                ),
-            }
-            if seconds_old is not None:
-                params["f_TPR"] = f"r{seconds_old}"
-
-            params = {k: v for k, v in params.items() if v is not None}
-            try:
-                response = self.session.get(
-                    f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                    params=params,
-                    timeout=10,
-                )
-                if response.status_code not in range(200, 400):
-                    if response.status_code == 429:
-                        err = (
-                            f"429 Response - Blocked by LinkedIn for too many requests"
-                        )
-                    else:
-                        err = f"LinkedIn response status code {response.status_code}"
-                        err += f" - {response.text}"
-                    logger.error(err)
-                    return JobResponse(jobs=job_list)
-            except Exception as e:
-                if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
-                else:
-                    logger.error(f"LinkedIn: {str(e)}")
-                return JobResponse(jobs=job_list)
-
-            soup = BeautifulSoup(response.text, "html.parser")
-            job_cards = soup.find_all("div", class_="base-search-card")
-            if len(job_cards) == 0:
-                return JobResponse(jobs=job_list)
-
-            for job_card in job_cards:
-                href_tag = job_card.find("a", class_="base-card__full-link")
-                if href_tag and "href" in href_tag.attrs:
-                    href = href_tag.attrs["href"].split("?")[0]
-                    job_id = href.split("-")[-1]
-
-                    if job_id in seen_ids:
-                        continue
-                    seen_ids.add(job_id)
-
-                    try:
-                        fetch_desc = scraper_input.linkedin_fetch_description
-                        job_post = self._process_job(job_card, job_id, fetch_desc)
-                        if job_post:
-                            job_list.append(job_post)
-                        if not continue_search():
-                            break
-                    except Exception as e:
-                        raise LinkedInException(str(e))
-
-            if continue_search():
-                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                start += len(job_list)
+        for location in scraper_input.locations:
+            logger.info(f"start searching for location: {location}")
+            while continue_search():
+                request_count += 1
+                logger.info(
+                    f"search page: {
+                        request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
+                )
+                params = {
+                    "keywords": scraper_input.search_term,
+                    "location": location,
+                    "distance": scraper_input.distance,
+                    "f_WT": 2 if scraper_input.is_remote else None,
+                    "f_JT": (
+                        self.job_type_code(scraper_input.job_type)
+                        if scraper_input.job_type
+                        else None
+                    ),
+                    "pageNum": 0,
+                    "start": start,
+                    "f_AL": "true" if scraper_input.easy_apply else None,
+                    "f_C": (
+                        ",".join(map(str, scraper_input.linkedin_company_ids))
+                        if scraper_input.linkedin_company_ids
+                        else None
+                    ),
+                }
+                if seconds_old is not None:
+                    params["f_TPR"] = f"r{seconds_old}"
+
+                params = {k: v for k, v in params.items() if v is not None}
+                try:
+                    response = self.session.get(
+                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                        params=params,
+                        timeout=10,
+                    )
+                    if response.status_code not in range(200, 400):
+                        if response.status_code == 429:
+                            err = (
+                                f"429 Response - Blocked by LinkedIn for too many requests"
+                            )
+                        else:
+                            err = f"LinkedIn response status code {
+                                response.status_code}"
+                            err += f" - {response.text}"
+                        logger.error(err)
+                        return JobResponse(jobs=job_list)
+                except MaxRetryError as e:
+                    """Raised when the maximum number of retries is exceeded."""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"MaxRetryError for location: {location}")
+                    break
+                except RetryError as e:
+                    """Custom retries logic failed"""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"RetryError for location: {location}")
+                    break
+                except Exception as e:
+                    if "Proxy responded with" in str(e):
+                        logger.error(f"LinkedIn: Bad proxy")
+                    else:
+                        logger.error(f"LinkedIn: {str(e)}")
+                    return JobResponse(jobs=job_list)
+
+                soup = BeautifulSoup(response.text, "html.parser")
+                job_cards = soup.find_all("div", class_="base-search-card")
+                if len(job_cards) == 0:
+                    break
+
+                for job_card in job_cards:
+                    href_tag = job_card.find(
+                        "a", class_="base-card__full-link")
+                    if href_tag and "href" in href_tag.attrs:
+                        href = href_tag.attrs["href"].split("?")[0]
+                        job_id = href.split("-")[-1]
+
+                        if job_id in seen_ids:
+                            continue
+                        seen_ids.add(job_id)
+
+                        try:
+                            fetch_desc = scraper_input.linkedin_fetch_description
+                            job_post = self._process_job(
+                                job_card, job_id, fetch_desc)
+                            if job_post:
+                                job_list.append(job_post)
+                            if not continue_search():
+                                break
+                        except Exception as e:
+                            raise LinkedInException(str(e))
+
+                if continue_search():
+                    time.sleep(random.uniform(
+                        self.delay, self.delay + self.band_delay))
+                    start += len(job_list)
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
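Reduced to a runnable toy, the control-flow change above looks like this (`fetch_page` is a hypothetical stand-in for the session call; the loop names mirror the diff):

from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError

def search_all(locations, fetch_page, pages=3):
    """Collect results per location; a retry failure skips to the next one."""
    results = []
    for location in locations:            # new outer loop over locations
        for page in range(pages):         # paging loop, as before
            try:
                results.extend(fetch_page(location, page))
            except (MaxRetryError, RetryError):
                break                     # abandon this location, not the whole search
    return results

def fetch_page(location, page):
    if location == "Ramat Gan, Israel":   # simulate LinkedIn blocking one location
        raise RetryError("simulated block")
    return [f"{location} job {page}"]

print(search_all(["Tel Aviv, Israel", "Ramat Gan, Israel"], fetch_page))
# ['Tel Aviv, Israel job 0', 'Tel Aviv, Israel job 1', 'Tel Aviv, Israel job 2']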
@@ -170,12 +189,14 @@ class LinkedInScraper(Scraper):
     def _process_job(
         self, job_card: Tag, job_id: str, full_descr: bool
     ) -> Optional[JobPost]:
-        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
+        salary_tag = job_card.find(
+            "span", class_="job-search-card__salary-info")
 
         compensation = None
         if salary_tag:
             salary_text = salary_tag.get_text(separator=" ").strip()
-            salary_values = [currency_parser(value) for value in salary_text.split("-")]
+            salary_values = [currency_parser(value)
+                             for value in salary_text.split("-")]
             salary_min = salary_values[0]
             salary_max = salary_values[1]
             currency = salary_text[0] if salary_text[0] != "$" else "USD"
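A worked example of the split-and-parse step that this hunk re-wraps (this `currency_parser` is a hypothetical stand-in for illustration; the real helper lives in JobSpy's utils):

import re

def currency_parser(cur_str: str) -> float:
    # Hypothetical stand-in: keep digits and the decimal point only.
    cleaned = re.sub(r"[^0-9.]", "", cur_str)
    return float(cleaned) if cleaned else 0.0

salary_text = "$80,000.00 - $100,000.00"   # e.g. text from the salary tag
salary_values = [currency_parser(value)
                 for value in salary_text.split("-")]
print(salary_values)  # [80000.0, 100000.0]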
@@ -196,9 +217,11 @@ class LinkedInScraper(Scraper):
             if company_a_tag and company_a_tag.has_attr("href")
             else ""
         )
-        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
+        company = company_a_tag.get_text(
+            strip=True) if company_a_tag else "N/A"
 
-        metadata_card = job_card.find("div", class_="base-search-card__metadata")
+        metadata_card = job_card.find(
+            "div", class_="base-search-card__metadata")
         location = self._get_location(metadata_card)
 
         datetime_tag = (