Updated LinkedIn scraper to support multiple locations; when max retries are reached for a location, it continues to the next location

pull/231/head
Yariv Menachem 2024-12-12 12:05:38 +02:00
parent f1c39e47bd
commit cb625f325f
2 changed files with 107 additions and 83 deletions
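
In short, the scraper now loops over scraper_input.locations and treats exhausted retries as a per-location failure: the pagination loop for that location is abandoned and the search moves on to the next one instead of aborting everything. A minimal sketch of that control flow, assuming hypothetical stand-ins (neither search_all_locations nor fetch_page exists in the codebase; fetch_page stands in for the real per-page request logic):

# Sketch only: mirrors the for-location / while-pagination structure in the diff below.
from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError

def search_all_locations(locations, fetch_page, results_wanted=200):
    jobs = []
    for location in locations:
        start = 0
        while len(jobs) < results_wanted and start < 1000:
            try:
                page = fetch_page(location, start)  # one LinkedIn results page
            except (MaxRetryError, RetryError):
                # Retries exhausted for this location: skip to the next
                # location rather than failing the whole search.
                break
            if not page:
                break
            jobs.extend(page)
            start += len(page)
    return jobs[:results_wanted]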


@@ -14,18 +14,19 @@ async def main():
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",
-        locations=["Rehovot"],
-        # locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
-        results_wanted=5,
+        # locations=["Rehovot"],
+        locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
+                   "Central, Israel", "Rehovot ,Israel"],
+        results_wanted=200,
         hours_old=200,
         country_indeed='israel',
     )
     print(f"Found {len(jobs)} jobs")
-    new_jobs = jobRepository.insertManyIfNotFound(jobs)
-    for new_job in new_jobs:
-        await telegramBot.sendJob(new_job)
+    newJobs = jobRepository.insertManyIfNotFound(jobs)
+    for newJob in newJobs:
+        await telegramBot.sendJob(newJob)
 
 # Run the async main function
 if __name__ == "__main__":


@@ -17,7 +17,8 @@ from datetime import datetime
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote
+from requests.exceptions import RetryError, RequestException
+from urllib3.exceptions import MaxRetryError
 from .constants import headers
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
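
The two new exception imports cover the retry-exhaustion case the commit handles per location. When a requests.Session is mounted with an HTTPAdapter whose max_retries is a urllib3 Retry policy and that policy gives up, requests raises RetryError (urllib3's MaxRetryError can also surface directly, for example through proxy layers). A hedged sketch of a session configured that way; the actual session construction is outside this diff and may differ:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry a few times on throttling/5xx, then give up and raise.
retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 502, 503])
session.mount("https://", HTTPAdapter(max_retries=retry_policy))

try:
    session.get(
        "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search",
        timeout=10,
    )
except requests.exceptions.RetryError:
    # Raised once the Retry policy is exhausted; the scraper breaks out of the
    # current location's loop when it catches this.
    pass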
@@ -82,87 +83,105 @@ class LinkedInScraper(Scraper):
             scraper_input.hours_old * 3600 if scraper_input.hours_old else None
         )
         continue_search = (
-            lambda: len(job_list) < scraper_input.results_wanted and start < 1000
+            lambda: len(
+                job_list) < scraper_input.results_wanted and start < 1000
         )
-        while continue_search():
-            request_count += 1
-            logger.info(
-                f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
-            )
-            params = {
-                "keywords": scraper_input.search_term,
-                "location": ",".join(scraper_input.locations),
-                "distance": scraper_input.distance,
-                "f_WT": 2 if scraper_input.is_remote else None,
-                "f_JT": (
-                    self.job_type_code(scraper_input.job_type)
-                    if scraper_input.job_type
-                    else None
-                ),
-                "pageNum": 0,
-                "start": start,
-                "f_AL": "true" if scraper_input.easy_apply else None,
-                "f_C": (
-                    ",".join(map(str, scraper_input.linkedin_company_ids))
-                    if scraper_input.linkedin_company_ids
-                    else None
-                ),
-            }
-            if seconds_old is not None:
-                params["f_TPR"] = f"r{seconds_old}"
-            params = {k: v for k, v in params.items() if v is not None}
-            try:
-                response = self.session.get(
-                    f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                    params=params,
-                    timeout=10,
-                )
-                if response.status_code not in range(200, 400):
-                    if response.status_code == 429:
-                        err = (
-                            f"429 Response - Blocked by LinkedIn for too many requests"
-                        )
-                    else:
-                        err = f"LinkedIn response status code {response.status_code}"
-                    err += f" - {response.text}"
-                    logger.error(err)
-                    return JobResponse(jobs=job_list)
-            except Exception as e:
-                if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
-                else:
-                    logger.error(f"LinkedIn: {str(e)}")
-                return JobResponse(jobs=job_list)
-            soup = BeautifulSoup(response.text, "html.parser")
-            job_cards = soup.find_all("div", class_="base-search-card")
-            if len(job_cards) == 0:
-                return JobResponse(jobs=job_list)
-            for job_card in job_cards:
-                href_tag = job_card.find("a", class_="base-card__full-link")
-                if href_tag and "href" in href_tag.attrs:
-                    href = href_tag.attrs["href"].split("?")[0]
-                    job_id = href.split("-")[-1]
-                    if job_id in seen_ids:
-                        continue
-                    seen_ids.add(job_id)
-                    try:
-                        fetch_desc = scraper_input.linkedin_fetch_description
-                        job_post = self._process_job(job_card, job_id, fetch_desc)
-                        if job_post:
-                            job_list.append(job_post)
-                        if not continue_search():
-                            break
-                    except Exception as e:
-                        raise LinkedInException(str(e))
-            if continue_search():
-                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                start += len(job_list)
+        for location in scraper_input.locations:
+            logger.info(f"start searching for location: {location}")
+            while continue_search():
+                request_count += 1
+                logger.info(
+                    f"search page: {
+                        request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
+                )
+                params = {
+                    "keywords": scraper_input.search_term,
+                    "location": location,
+                    "distance": scraper_input.distance,
+                    "f_WT": 2 if scraper_input.is_remote else None,
+                    "f_JT": (
+                        self.job_type_code(scraper_input.job_type)
+                        if scraper_input.job_type
+                        else None
+                    ),
+                    "pageNum": 0,
+                    "start": start,
+                    "f_AL": "true" if scraper_input.easy_apply else None,
+                    "f_C": (
+                        ",".join(map(str, scraper_input.linkedin_company_ids))
+                        if scraper_input.linkedin_company_ids
+                        else None
+                    ),
+                }
+                if seconds_old is not None:
+                    params["f_TPR"] = f"r{seconds_old}"
+                params = {k: v for k, v in params.items() if v is not None}
+                try:
+                    response = self.session.get(
+                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                        params=params,
+                        timeout=10,
+                    )
+                    if response.status_code not in range(200, 400):
+                        if response.status_code == 429:
+                            err = (
+                                f"429 Response - Blocked by LinkedIn for too many requests"
+                            )
+                        else:
+                            err = f"LinkedIn response status code {
+                                response.status_code}"
+                        err += f" - {response.text}"
+                        logger.error(err)
+                        return JobResponse(jobs=job_list)
+                except MaxRetryError as e:
+                    """Raised when the maximum number of retries is exceeded."""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"MaxRetryError for location: {location}")
+                    break
+                except RetryError as e:
+                    """Custom retries logic failed"""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"RetryError for location: {location}")
+                    break
+                except Exception as e:
+                    if "Proxy responded with" in str(e):
+                        logger.error(f"LinkedIn: Bad proxy")
+                    else:
+                        logger.error(f"LinkedIn: {str(e)}")
+                    return JobResponse(jobs=job_list)
+                soup = BeautifulSoup(response.text, "html.parser")
+                job_cards = soup.find_all("div", class_="base-search-card")
+                if len(job_cards) == 0:
+                    break
+                for job_card in job_cards:
+                    href_tag = job_card.find(
+                        "a", class_="base-card__full-link")
+                    if href_tag and "href" in href_tag.attrs:
+                        href = href_tag.attrs["href"].split("?")[0]
+                        job_id = href.split("-")[-1]
+                        if job_id in seen_ids:
+                            continue
+                        seen_ids.add(job_id)
+                        try:
+                            fetch_desc = scraper_input.linkedin_fetch_description
+                            job_post = self._process_job(
+                                job_card, job_id, fetch_desc)
+                            if job_post:
+                                job_list.append(job_post)
+                            if not continue_search():
+                                break
+                        except Exception as e:
+                            raise LinkedInException(str(e))
+                if continue_search():
+                    time.sleep(random.uniform(
+                        self.delay, self.delay + self.band_delay))
+                    start += len(job_list)
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
@@ -170,12 +189,14 @@ class LinkedInScraper(Scraper):
     def _process_job(
         self, job_card: Tag, job_id: str, full_descr: bool
     ) -> Optional[JobPost]:
-        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
+        salary_tag = job_card.find(
+            "span", class_="job-search-card__salary-info")
 
         compensation = None
         if salary_tag:
             salary_text = salary_tag.get_text(separator=" ").strip()
-            salary_values = [currency_parser(value) for value in salary_text.split("-")]
+            salary_values = [currency_parser(value)
+                             for value in salary_text.split("-")]
             salary_min = salary_values[0]
             salary_max = salary_values[1]
             currency = salary_text[0] if salary_text[0] != "$" else "USD"
@@ -196,9 +217,11 @@ class LinkedInScraper(Scraper):
             if company_a_tag and company_a_tag.has_attr("href")
             else ""
         )
-        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
+        company = company_a_tag.get_text(
+            strip=True) if company_a_tag else "N/A"
 
-        metadata_card = job_card.find("div", class_="base-search-card__metadata")
+        metadata_card = job_card.find(
+            "div", class_="base-search-card__metadata")
         location = self._get_location(metadata_card)
 
         datetime_tag = (