changed indeed jobTitle scraper

pull/91/head
WillBlears 2023-11-06 20:01:37 -05:00
parent 2b7fea40a5
commit 6606345e84
1 changed file with 22 additions and 25 deletions


@@ -9,6 +9,7 @@ import math
 import io
 import json
 from datetime import datetime
+import html
 import urllib.parse
 from bs4 import BeautifulSoup
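
The newly added import html is used later in this commit to unescape HTML entities embedded in the extracted description. A minimal illustration of that step, using a made-up string rather than anything from Indeed's payload:

    import html

    # Entities such as &amp; and &lt;b&gt; in the captured description become plain characters.
    print(html.unescape("Senior Engineer &amp; Team Lead &lt;b&gt;Remote&lt;/b&gt;"))
    # -> Senior Engineer & Team Lead <b>Remote</b>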
@@ -147,7 +148,7 @@ class IndeedScraper(Scraper):
             description = " ".join(li.text for li in li_elements)
             job_post = JobPost(
-                title=job["normTitle"],
+                title=job["displayTitle"],
                 description=description,
                 company_name=job["company"],
                 location=Location(
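
The title now comes from displayTitle instead of normTitle. If a defensive read were wanted, a hedged sketch (only the two key names appear in this diff; the fallback chain below is an assumption, not part of the commit):

    # Prefer the title as displayed on Indeed's results page; fall back to the
    # normalized title if the key is missing (fallback is an assumption).
    title = job.get("displayTitle") or job.get("normTitle") or ""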
@@ -210,6 +211,7 @@ class IndeedScraper(Scraper):
         )
         return job_response
+
     def get_description(self, job_page_url: str) -> str | None:
         """
         Retrieves job description by going to the job page url
@@ -235,33 +237,28 @@ class IndeedScraper(Scraper):
         if response.status_code not in range(200, 400):
             return None
-        soup = BeautifulSoup(response.text, "html.parser")
-        script_tag = soup.find(
-            "script", text=lambda x: x and "window._initialData" in x
-        )
+        # Search for job description in the response content
+        job_desc_pattern = re.compile(r'"sanitizedJobDescription":"(.*?)"\s*,', re.DOTALL)
+        job_desc_match = job_desc_pattern.search(response.text)
-        if not script_tag:
-            return None
+        # If a match is found, parse the HTML to extract the text
+        if job_desc_match:
+            # Extract the job description HTML content
+            job_desc_html = job_desc_match.group(1)
+            # Unescape HTML entities
+            job_desc_html = html.unescape(job_desc_html)
+            # Replace escaped forward slashes and remove line breaks
+            job_desc_html = job_desc_html.replace('\\/', '/').replace('\\n', ' ')
+            # Parse the HTML content with BeautifulSoup
+            soup = BeautifulSoup(job_desc_html, "html.parser")
+            # Extract text content from the HTML, with whitespace normalized
+            text_content = ' '.join(soup.get_text(separator=" ").split())
+            # Further clean up to remove any tags that might have been missed
+            clean_text = re.sub(r'<[^>]+>', '', text_content)
+            return clean_text.strip()
+        else:
+            return None
-        script_code = script_tag.string
-        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
-        if not match:
-            return None
-        json_string = match.group(1)
-        data = json.loads(json_string)
-        try:
-            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
-                "sanitizedJobDescription"
-            ]
-        except (KeyError, TypeError, IndexError):
-            return None
-        soup = BeautifulSoup(job_description, "html.parser")
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
-        return text_content

     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:
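
Taken together, the new body of get_description drops the window._initialData JSON parse and instead pulls sanitizedJobDescription straight out of the page source with a regex. A self-contained sketch of the same pipeline on a made-up payload (the key names and cleanup steps come from this diff; the rest of the sample string is illustrative):

    import html
    import re

    from bs4 import BeautifulSoup

    # Illustrative stand-in for response.text; the real page embeds this in a much larger JSON blob.
    page_source = (
        '{"jobInfoModel":{"sanitizedJobDescription":'
        '"<p>Build &amp; maintain scrapers.<\\/p>\\n<ul><li>Python<\\/li><\\/ul>" ,"other":1}}'
    )

    job_desc_pattern = re.compile(r'"sanitizedJobDescription":"(.*?)"\s*,', re.DOTALL)
    job_desc_match = job_desc_pattern.search(page_source)

    if job_desc_match:
        job_desc_html = job_desc_match.group(1)
        job_desc_html = html.unescape(job_desc_html)                           # &amp; -> &
        job_desc_html = job_desc_html.replace('\\/', '/').replace('\\n', ' ')  # \/ -> /, \n -> space
        soup = BeautifulSoup(job_desc_html, "html.parser")
        text_content = ' '.join(soup.get_text(separator=" ").split())
        print(re.sub(r'<[^>]+>', '', text_content).strip())
        # -> Build & maintain scrapers. Python

One trade-off of the non-greedy capture: it stops at the first quote followed by a comma, so a description containing an escaped double quote would be cut short at that point.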