Compare commits

...

5 Commits

Author           SHA1        Message                                    Date
Cullen Watson    a7ad616567  fix: linkedin no results (#80)             2024-01-10 14:01:10 -06:00
cullenwatson     53bc33a43a  chore: version                             2024-01-09 19:33:56 -06:00
Cullen Watson    22870438c7  linkedin fix delays (#79)                  2024-01-09 19:32:51 -06:00
Cullen Watson    aeb93b99f5  Update pyproject.toml                      2024-01-03 12:04:50 -06:00
Cullen Watson    a5916edcdd  fix(glassdoor): add retry adapter (#77)    2024-01-03 12:04:32 -06:00
4 changed files with 73 additions and 56 deletions

File: pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.30"
+version = "1.1.33"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

File: jobspy.scrapers.glassdoor

@@ -26,7 +26,7 @@ class GlassdoorScraper(Scraper):
         """
         Initializes GlassdoorScraper with the Glassdoor job search url
         """
-        site = Site(Site.ZIP_RECRUITER)
+        site = Site(Site.GLASSDOOR)
         super().__init__(site, proxy=proxy)
 
         self.url = None
@@ -49,7 +49,7 @@ class GlassdoorScraper(Scraper):
             payload = self.add_payload(
                 scraper_input, location_id, location_type, page_num, cursor
             )
-            session = create_session(self.proxy, is_tls=False)
+            session = create_session(self.proxy, is_tls=False, has_retry=True)
             response = session.post(
                 f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
             )
@@ -171,7 +171,7 @@ class GlassdoorScraper(Scraper):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
         url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
-        session = create_session(self.proxy)
+        session = create_session(self.proxy, has_retry=True)
         response = session.get(url)
         if response.status_code != 200:
             raise GlassdoorException(
@@ -194,7 +194,7 @@ class GlassdoorScraper(Scraper):
         location_type: str,
         page_num: int,
         cursor: str | None = None,
-    ) -> dict[str, str | Any]:
+    ) -> str:
         payload = {
             "operationName": "JobSearchResultsQuery",
             "variables": {

File: jobspy.scrapers.linkedin

@@ -4,6 +4,7 @@ jobspy.scrapers.linkedin
 
 This module contains routines to scrape LinkedIn.
 """
+import random
 from typing import Optional
 from datetime import datetime
@@ -16,14 +17,14 @@ from threading import Lock
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
+from ..utils import create_session
 from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 
 
 class LinkedInScraper(Scraper):
-    MAX_RETRIES = 3
-    DELAY = 10
+    DELAY = 3
 
     def __init__(self, proxy: Optional[str] = None):
         """
@@ -57,6 +58,7 @@ class LinkedInScraper(Scraper):
             return mapping.get(job_type_enum, "")
 
         while len(job_list) < scraper_input.results_wanted and page < 1000:
+            session = create_session(is_tls=False, has_retry=True, delay=5)
             params = {
                 "keywords": scraper_input.search_term,
                 "location": scraper_input.location,
@@ -71,44 +73,30 @@ class LinkedInScraper(Scraper):
             }
 
             params = {k: v for k, v in params.items() if v is not None}
 
-            retries = 0
-            while retries < self.MAX_RETRIES:
-                try:
-                    response = requests.get(
-                        f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                        params=params,
-                        allow_redirects=True,
-                        proxies=self.proxy,
-                        timeout=10,
-                    )
-                    response.raise_for_status()
-                    break
-                except requests.HTTPError as e:
-                    if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code in (429, 502):
-                            time.sleep(self.DELAY)
-                            retries += 1
-                            continue
-                        else:
-                            raise LinkedInException(
-                                f"bad response status code: {e.response.status_code}"
-                            )
-                    else:
-                        raise
-                except ProxyError as e:
-                    raise LinkedInException("bad proxy")
-                except Exception as e:
-                    raise LinkedInException(str(e))
-            else:
-                # Raise an exception if the maximum number of retries is reached
-                raise LinkedInException(
-                    "Max retries reached, failed to get a valid response"
-                )
+            try:
+                response = session.get(
+                    f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
+                    params=params,
+                    allow_redirects=True,
+                    proxies=self.proxy,
+                    headers=self.headers(),
+                    timeout=10,
+                )
+                response.raise_for_status()
+            except requests.HTTPError as e:
+                raise LinkedInException(f"bad response status code: {e.response.status_code}")
+            except ProxyError as e:
+                raise LinkedInException("bad proxy")
+            except Exception as e:
+                raise LinkedInException(str(e))
 
             soup = BeautifulSoup(response.text, "html.parser")
+            job_cards = soup.find_all("div", class_="base-search-card")
+            if len(job_cards) == 0:
+                return JobResponse(jobs=job_list)
 
-            for job_card in soup.find_all("div", class_="base-search-card"):
+            for job_card in job_cards:
                 job_url = None
                 href_tag = job_card.find("a", class_="base-card__full-link")
                 if href_tag and "href" in href_tag.attrs:
@@ -130,6 +118,7 @@ class LinkedInScraper(Scraper):
                     raise LinkedInException("Exception occurred while processing jobs")
 
             page += 25
+            time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
 
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)
@@ -181,22 +170,22 @@ class LinkedInScraper(Scraper):
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
 
-        description, job_type = self.get_job_description(job_url)
-        # description, job_type = None, []
+        # removed to speed up scraping
+        # description, job_type = self.get_job_description(job_url)
 
         return JobPost(
             title=title,
-            description=description,
             company_name=company,
             company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
-            job_type=job_type,
             compensation=compensation,
             benefits=benefits,
-            emails=extract_emails_from_text(description) if description else None,
-            num_urgent_words=count_urgent_words(description) if description else None,
+            # job_type=job_type,
+            # description=description,
+            # emails=extract_emails_from_text(description) if description else None,
+            # num_urgent_words=count_urgent_words(description) if description else None,
         )
@@ -208,12 +197,10 @@ class LinkedInScraper(Scraper):
         :return: description or None
         """
         try:
-            response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
+            session = create_session(is_tls=False, has_retry=True)
+            response = session.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
         except requests.HTTPError as e:
-            if hasattr(e, "response") and e.response is not None:
-                if e.response.status_code in (429, 502):
-                    time.sleep(self.DELAY)
             return None, None
         except Exception as e:
             return None, None
@@ -287,3 +274,21 @@ class LinkedInScraper(Scraper):
         )
         return location
+
+    @staticmethod
+    def headers() -> dict:
+        return {
+            'authority': 'www.linkedin.com',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+            'accept-language': 'en-US,en;q=0.9',
+            'cache-control': 'max-age=0',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            # 'sec-ch-ua-mobile': '?0',
+            # 'sec-ch-ua-platform': '"macOS"',
+            # 'sec-fetch-dest': 'document',
+            # 'sec-fetch-mode': 'navigate',
+            # 'sec-fetch-site': 'none',
+            # 'sec-fetch-user': '?1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        }
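
The LinkedIn changes replace the hand-rolled retry loop with a retry-enabled session and move throttling to a jittered sleep between result pages (DELAY presumably drops from 10 to 3 because it now runs on every page, not only after errors). A minimal sketch of the jittered-pagination pattern, with the fetch/parse step elided:

import random
import time

DELAY = 3  # base delay between pages, per the new class constant

for offset in range(0, 1000, 25):  # LinkedIn paginates in steps of 25 results
    # ... fetch and parse one page of job cards here ...
    # sleep 3-5 seconds so the request cadence is not perfectly uniform
    time.sleep(random.uniform(DELAY, DELAY + 2))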

File: jobspy.scrapers.utils

@@ -1,8 +1,10 @@
 import re
 import numpy as np
 
-import requests
 import tls_client
+import requests
+from requests.adapters import HTTPAdapter, Retry
 
 from ..jobs import JobType
@@ -27,11 +29,11 @@ def extract_emails_from_text(text: str) -> list[str] | None:
     return email_regex.findall(text)
 
 
-def create_session(proxy: dict | None = None, is_tls: bool = True):
+def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
     """
-    Creates a tls client session
-    :return: A session object with or without proxies.
+    Creates a requests session with optional tls, proxy, and retry settings.
+    :return: A session object
     """
     if is_tls:
         session = tls_client.Session(
@@ -44,6 +46,16 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
         session.allow_redirects = True
     if proxy:
         session.proxies.update(proxy)
+    if has_retry:
+        retries = Retry(total=3,
+                        connect=3,
+                        status=3,
+                        status_forcelist=[500, 502, 503, 504, 429],
+                        backoff_factor=delay)
+        adapter = HTTPAdapter(max_retries=retries)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
 
     return session
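
For reference, a usage sketch of the updated helper based only on the signature above; the proxy dict shape follows requests' conventions, and the top-level import path is an assumption from the package layout:

from jobspy.scrapers.utils import create_session  # import path assumed

proxy = {"http": "http://user:pass@host:port", "https": "http://user:pass@host:port"}

# requests.Session with retries on 429/5xx and a 5-second backoff factor,
# as the LinkedIn scraper calls it above
session = create_session(proxy, is_tls=False, has_retry=True, delay=5)

# default: tls_client session (browser-like TLS fingerprint), no retry adapter
tls_session = create_session(proxy)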