Updated LinkedIn scraper to support multiple locations; when max retries are reached for a location, it continues to the next location

pull/231/head
Yariv Menachem 2024-12-12 12:05:38 +02:00
parent f1c39e47bd
commit cb625f325f
2 changed files with 107 additions and 83 deletions
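
In short, the scraper now loops over scraper_input.locations and treats exhausted retries as a per-location failure: the pagination loop for that location is abandoned and the search moves on to the next one instead of aborting everything. A minimal sketch of that control flow, assuming hypothetical stand-ins (neither search_all_locations nor fetch_page exists in the codebase; fetch_page stands in for the real per-page request logic):

# Sketch only: mirrors the for-location / while-pagination structure in the diff below.
from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError

def search_all_locations(locations, fetch_page, results_wanted=200):
    jobs = []
    for location in locations:
        start = 0
        while len(jobs) < results_wanted and start < 1000:
            try:
                page = fetch_page(location, start)  # one LinkedIn results page
            except (MaxRetryError, RetryError):
                # Retries exhausted for this location: skip to the next
                # location rather than failing the whole search.
                break
            if not page:
                break
            jobs.extend(page)
            start += len(page)
    return jobs[:results_wanted]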


@@ -14,18 +14,19 @@ async def main():
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",
-        locations=["Rehovot"],
-        # locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
-        results_wanted=5,
+        # locations=["Rehovot"],
+        locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
+                   "Central, Israel", "Rehovot ,Israel"],
+        results_wanted=200,
         hours_old=200,
         country_indeed='israel',
     )
     print(f"Found {len(jobs)} jobs")
-    new_jobs = jobRepository.insertManyIfNotFound(jobs)
-    for new_job in new_jobs:
-        await telegramBot.sendJob(new_job)
+    newJobs = jobRepository.insertManyIfNotFound(jobs)
+    for newJob in newJobs:
+        await telegramBot.sendJob(newJob)
 
 # Run the async main function
 if __name__ == "__main__":


@@ -17,7 +17,8 @@ from datetime import datetime
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote
+from requests.exceptions import RetryError, RequestException
+from urllib3.exceptions import MaxRetryError
 from .constants import headers
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
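
The two new exception imports cover the retry-exhaustion case the commit handles per location. When a requests.Session is mounted with an HTTPAdapter whose max_retries is a urllib3 Retry policy and that policy gives up, requests raises RetryError (urllib3's MaxRetryError can also surface directly, for example through proxy layers). A hedged sketch of a session configured that way; the actual session construction is outside this diff and may differ:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry a few times on throttling/5xx, then give up and raise.
retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 502, 503])
session.mount("https://", HTTPAdapter(max_retries=retry_policy))

try:
    session.get(
        "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search",
        timeout=10,
    )
except requests.exceptions.RetryError:
    # Raised once the Retry policy is exhausted; the scraper breaks out of the
    # current location's loop when it catches this.
    pass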
@@ -82,87 +83,105 @@ class LinkedInScraper(Scraper):
             scraper_input.hours_old * 3600 if scraper_input.hours_old else None
         )
         continue_search = (
-            lambda: len(job_list) < scraper_input.results_wanted and start < 1000
+            lambda: len(
+                job_list) < scraper_input.results_wanted and start < 1000
         )
-        while continue_search():
-            request_count += 1
-            logger.info(
-                f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
-            )
-            params = {
-                "keywords": scraper_input.search_term,
-                "location": ",".join(scraper_input.locations),
-                "distance": scraper_input.distance,
-                "f_WT": 2 if scraper_input.is_remote else None,
-                "f_JT": (
-                    self.job_type_code(scraper_input.job_type)
-                    if scraper_input.job_type
-                    else None
-                ),
-                "pageNum": 0,
-                "start": start,
-                "f_AL": "true" if scraper_input.easy_apply else None,
-                "f_C": (
-                    ",".join(map(str, scraper_input.linkedin_company_ids))
-                    if scraper_input.linkedin_company_ids
-                    else None
-                ),
-            }
-            if seconds_old is not None:
-                params["f_TPR"] = f"r{seconds_old}"
-            params = {k: v for k, v in params.items() if v is not None}
-            try:
-                response = self.session.get(
-                    f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                    params=params,
-                    timeout=10,
-                )
-                if response.status_code not in range(200, 400):
-                    if response.status_code == 429:
-                        err = (
-                            f"429 Response - Blocked by LinkedIn for too many requests"
-                        )
-                    else:
-                        err = f"LinkedIn response status code {response.status_code}"
-                    err += f" - {response.text}"
-                    logger.error(err)
-                    return JobResponse(jobs=job_list)
-            except Exception as e:
-                if "Proxy responded with" in str(e):
-                    logger.error(f"LinkedIn: Bad proxy")
-                else:
-                    logger.error(f"LinkedIn: {str(e)}")
-                return JobResponse(jobs=job_list)
-            soup = BeautifulSoup(response.text, "html.parser")
-            job_cards = soup.find_all("div", class_="base-search-card")
-            if len(job_cards) == 0:
-                return JobResponse(jobs=job_list)
-            for job_card in job_cards:
-                href_tag = job_card.find("a", class_="base-card__full-link")
-                if href_tag and "href" in href_tag.attrs:
-                    href = href_tag.attrs["href"].split("?")[0]
-                    job_id = href.split("-")[-1]
-                    if job_id in seen_ids:
-                        continue
-                    seen_ids.add(job_id)
-                    try:
-                        fetch_desc = scraper_input.linkedin_fetch_description
-                        job_post = self._process_job(job_card, job_id, fetch_desc)
-                        if job_post:
-                            job_list.append(job_post)
-                        if not continue_search():
-                            break
-                    except Exception as e:
-                        raise LinkedInException(str(e))
-            if continue_search():
-                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                start += len(job_list)
+        for location in scraper_input.locations:
+            logger.info(f"start searching for location: {location}")
+            while continue_search():
+                request_count += 1
+                logger.info(
+                    f"search page: {
+                        request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
+                )
+                params = {
+                    "keywords": scraper_input.search_term,
+                    "location": location,
+                    "distance": scraper_input.distance,
+                    "f_WT": 2 if scraper_input.is_remote else None,
+                    "f_JT": (
+                        self.job_type_code(scraper_input.job_type)
+                        if scraper_input.job_type
+                        else None
+                    ),
+                    "pageNum": 0,
+                    "start": start,
+                    "f_AL": "true" if scraper_input.easy_apply else None,
+                    "f_C": (
+                        ",".join(map(str, scraper_input.linkedin_company_ids))
+                        if scraper_input.linkedin_company_ids
+                        else None
+                    ),
+                }
+                if seconds_old is not None:
+                    params["f_TPR"] = f"r{seconds_old}"
+                params = {k: v for k, v in params.items() if v is not None}
+                try:
+                    response = self.session.get(
+                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                        params=params,
+                        timeout=10,
+                    )
+                    if response.status_code not in range(200, 400):
+                        if response.status_code == 429:
+                            err = (
+                                f"429 Response - Blocked by LinkedIn for too many requests"
+                            )
+                        else:
+                            err = f"LinkedIn response status code {
+                                response.status_code}"
+                        err += f" - {response.text}"
+                        logger.error(err)
+                        return JobResponse(jobs=job_list)
+                except MaxRetryError as e:
+                    """Raised when the maximum number of retries is exceeded."""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"MaxRetryError for location: {location}")
+                    break
+                except RetryError as e:
+                    """Custom retries logic failed"""
+                    logger.error(f"RetryError: {str(e)}")
+                    logger.error(f"RetryError for location: {location}")
+                    break
+                except Exception as e:
+                    if "Proxy responded with" in str(e):
+                        logger.error(f"LinkedIn: Bad proxy")
+                    else:
+                        logger.error(f"LinkedIn: {str(e)}")
+                    return JobResponse(jobs=job_list)
+                soup = BeautifulSoup(response.text, "html.parser")
+                job_cards = soup.find_all("div", class_="base-search-card")
+                if len(job_cards) == 0:
+                    break
+                for job_card in job_cards:
+                    href_tag = job_card.find(
+                        "a", class_="base-card__full-link")
+                    if href_tag and "href" in href_tag.attrs:
+                        href = href_tag.attrs["href"].split("?")[0]
+                        job_id = href.split("-")[-1]
+                        if job_id in seen_ids:
+                            continue
+                        seen_ids.add(job_id)
+                        try:
+                            fetch_desc = scraper_input.linkedin_fetch_description
+                            job_post = self._process_job(
+                                job_card, job_id, fetch_desc)
+                            if job_post:
+                                job_list.append(job_post)
+                            if not continue_search():
+                                break
+                        except Exception as e:
+                            raise LinkedInException(str(e))
+                if continue_search():
+                    time.sleep(random.uniform(
+                        self.delay, self.delay + self.band_delay))
+                    start += len(job_list)
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
@@ -170,12 +189,14 @@ class LinkedInScraper(Scraper):
     def _process_job(
         self, job_card: Tag, job_id: str, full_descr: bool
     ) -> Optional[JobPost]:
-        salary_tag = job_card.find("span", class_="job-search-card__salary-info")
+        salary_tag = job_card.find(
+            "span", class_="job-search-card__salary-info")
 
         compensation = None
         if salary_tag:
             salary_text = salary_tag.get_text(separator=" ").strip()
-            salary_values = [currency_parser(value) for value in salary_text.split("-")]
+            salary_values = [currency_parser(value)
+                             for value in salary_text.split("-")]
             salary_min = salary_values[0]
             salary_max = salary_values[1]
             currency = salary_text[0] if salary_text[0] != "$" else "USD"
@@ -196,9 +217,11 @@ class LinkedInScraper(Scraper):
             if company_a_tag and company_a_tag.has_attr("href")
             else ""
         )
-        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
+        company = company_a_tag.get_text(
+            strip=True) if company_a_tag else "N/A"
 
-        metadata_card = job_card.find("div", class_="base-search-card__metadata")
+        metadata_card = job_card.find(
+            "div", class_="base-search-card__metadata")
         location = self._get_location(metadata_card)
 
         datetime_tag = (