changed indeed jobTitle scraper

2023-11-06 20:01:37 -05:00 · 2023-11-06 20:01:37 -05:00 · 6606345e84
parent 2b7fea40a5
commit 6606345e84
1 changed files with 22 additions and 25 deletions
--- a/src/jobspy/scrapers/indeed/init.py
+++ b/src/jobspy/scrapers/indeed/init.py
@ -9,6 +9,7 @@ import math
 import io
 import json
 from datetime import datetime
 import html
 import urllib.parse
 from bs4 import BeautifulSoup
@ -147,7 +148,7 @@ class IndeedScraper(Scraper):
                    description = " ".join(li.text for li in li_elements)
            job_post = JobPost(
-                title=job["normTitle"],
+                title=job["displayTitle"],
                description=description,
                company_name=job["company"],
                location=Location(
@ -210,6 +211,7 @@ class IndeedScraper(Scraper):
        )
        return job_response
    def get_description(self, job_page_url: str) -> str | None:
        """
        Retrieves job description by going to the job page url
@ -235,33 +237,28 @@ class IndeedScraper(Scraper):
        if response.status_code not in range(200, 400):
            return None
-        soup = BeautifulSoup(response.text, "html.parser")
+        # Search for job description in the response content
-        script_tag = soup.find(
+        job_desc_pattern = re.compile(r'"sanitizedJobDescription":"(.*?)"\s*,', re.DOTALL)
-            "script", text=lambda x: x and "window._initialData" in x
+        job_desc_match = job_desc_pattern.search(response.text)
        )
-        if not script_tag:
+        # If a match is found, parse the HTML to extract the text
        if job_desc_match:
            # Extracting the job description HTML content
            job_desc_html = job_desc_match.group(1)
            # Unescape HTML entities
            job_desc_html = html.unescape(job_desc_html)
            # Replace escaped forward slashes and remove line breaks
            job_desc_html = job_desc_html.replace('\\/', '/').replace('\\n', ' ')
            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(job_desc_html, "html.parser")
            # Extract text content from the HTML, with whitespace normalized
            text_content = ' '.join(soup.get_text(separator=" ").split())
            # Further clean up to remove any tags that might have been missed
            clean_text = re.sub(r'<[^>]+>', '', text_content)
            return clean_text.strip()
        else:
            return None
        script_code = script_tag.string
        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
        if not match:
            return None
        json_string = match.group(1)
        data = json.loads(json_string)
        try:
            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
                "sanitizedJobDescription"
            ]
        except (KeyError, TypeError, IndexError):
            return None
        soup = BeautifulSoup(job_description, "html.parser")
        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
        return text_content
    @staticmethod
    def get_job_type(job: dict) -> list[JobType] | None: