Changed Indeed jobTitle scraper

pull/91/head
WillBlears 2023-11-06 20:01:37 -05:00
parent 2b7fea40a5
commit 6606345e84
1 changed file with 22 additions and 25 deletions

View File

@ -9,6 +9,7 @@ import math
import io import io
import json import json
from datetime import datetime from datetime import datetime
import html
import urllib.parse import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -147,7 +148,7 @@ class IndeedScraper(Scraper):
description = " ".join(li.text for li in li_elements) description = " ".join(li.text for li in li_elements)
job_post = JobPost( job_post = JobPost(
title=job["normTitle"], title=job["displayTitle"],
description=description, description=description,
company_name=job["company"], company_name=job["company"],
location=Location( location=Location(
@ -210,6 +211,7 @@ class IndeedScraper(Scraper):
) )
return job_response return job_response
def get_description(self, job_page_url: str) -> str | None: def get_description(self, job_page_url: str) -> str | None:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
@ -235,33 +237,28 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None return None
soup = BeautifulSoup(response.text, "html.parser") # Search for job description in the response content
script_tag = soup.find( job_desc_pattern = re.compile(r'"sanitizedJobDescription":"(.*?)"\s*,', re.DOTALL)
"script", text=lambda x: x and "window._initialData" in x job_desc_match = job_desc_pattern.search(response.text)
)
if not script_tag: # If a match is found, parse the HTML to extract the text
if job_desc_match:
# Extracting the job description HTML content
job_desc_html = job_desc_match.group(1)
# Unescape HTML entities
job_desc_html = html.unescape(job_desc_html)
# Replace escaped forward slashes and remove line breaks
job_desc_html = job_desc_html.replace('\\/', '/').replace('\\n', ' ')
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(job_desc_html, "html.parser")
# Extract text content from the HTML, with whitespace normalized
text_content = ' '.join(soup.get_text(separator=" ").split())
# Further clean up to remove any tags that might have been missed
clean_text = re.sub(r'<[^>]+>', '', text_content)
return clean_text.strip()
else:
return None return None
script_code = script_tag.string
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(job_description, "html.parser")
text_content = " ".join(soup.get_text(separator=" ").split()).strip()
return text_content
@staticmethod @staticmethod
def get_job_type(job: dict) -> list[JobType] | None: def get_job_type(job: dict) -> list[JobType] | None: