mirror of https://github.com/Bunsly/JobSpy
parent
df339610fa
commit
ccb0c17660
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "python-jobspy"
|
||||
version = "1.1.55"
|
||||
version = "1.1.56"
|
||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||
homepage = "https://github.com/Bunsly/JobSpy"
|
||||
|
|
|
@ -19,7 +19,7 @@ from urllib.parse import urlparse, urlunparse, unquote
|
|||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import LinkedInException
|
||||
from ..utils import create_session
|
||||
from ..utils import create_session, remove_attributes
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Location,
|
||||
|
@ -257,12 +257,6 @@ class LinkedInScraper(Scraper):
|
|||
)
|
||||
description = None
|
||||
if div_content is not None:
|
||||
|
||||
def remove_attributes(tag):
|
||||
for attr in list(tag.attrs):
|
||||
del tag[attr]
|
||||
return tag
|
||||
|
||||
div_content = remove_attributes(div_content)
|
||||
description = div_content.prettify(formatter="html")
|
||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||
|
|
|
@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
|
|||
else:
|
||||
self.proxies = {}
|
||||
response = tls_client.Session.execute_request(self, *args, **kwargs)
|
||||
response.ok = response.status_code in range(200, 400)
|
||||
return response
|
||||
|
||||
|
||||
|
@ -178,3 +179,9 @@ def currency_parser(cur_str):
|
|||
num = float(cur_str)
|
||||
|
||||
return np.round(num, 2)
|
||||
|
||||
|
||||
def remove_attributes(tag):
    """Delete every attribute set on ``tag`` and return the same object.

    Only the attributes listed in ``tag.attrs`` of the object passed in are
    removed; the object is modified in place and handed back so the call can
    be chained.
    """
    # Snapshot the names first: deleting while walking the live mapping
    # would change it mid-iteration.
    attribute_names = tuple(tag.attrs)
    for name in attribute_names:
        del tag[name]
    return tag
|
||||
|
|
|
@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, Any
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import (
|
||||
logger,
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
markdown_converter,
|
||||
remove_attributes,
|
||||
)
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
|
@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
|
|||
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
||||
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
||||
comp_currency = job.get("compensation_currency")
|
||||
description_full, job_url_direct = self._get_descr(job_url)
|
||||
|
||||
return JobPost(
|
||||
id=str(job["listing_key"]),
|
||||
title=title,
|
||||
|
@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
|
|||
),
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
description=description,
|
||||
description=description_full if description_full else description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
job_url_direct=job_url_direct,
|
||||
)
|
||||
|
||||
def _get_descr(self, job_url):
    """Fetch a job's detail page and pull out its description and direct URL.

    :param job_url: URL of the job detail page to request.
    :return: tuple ``(description_full, job_url_direct)``. Both are ``None``
        when the response is not ``ok``. ``description_full`` is the cleaned
        job description markup followed by the company description markup
        (an empty string when neither element is present), passed through
        ``markdown_converter`` when the scraper input asks for markdown.
        ``job_url_direct`` is the text captured after ``job_url=`` in the
        page's embedded ``saveJobURL`` value, or ``None`` when it cannot be
        determined.
    """
    res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
    description_full = job_url_direct = None
    if res.ok:
        soup = BeautifulSoup(res.text, "html.parser")
        job_descr_div = soup.find("div", class_="job_description")
        company_descr_section = soup.find("section", class_="company_description")
        job_description_clean = (
            remove_attributes(job_descr_div).prettify(formatter="html")
            if job_descr_div
            else ""
        )
        company_description_clean = (
            remove_attributes(company_descr_section).prettify(formatter="html")
            if company_descr_section
            else ""
        )
        description_full = job_description_clean + company_description_clean

        # The direct URL is an optional extra: a missing, empty or malformed
        # JSON payload must not discard the description extracted above.
        script_tag = soup.find("script", type="application/json")
        if script_tag and script_tag.string:
            try:
                job_json = json.loads(script_tag.string)
                job_url_val = job_json["model"]["saveJobURL"]
                m = re.search(r"job_url=(.+)", job_url_val)
            except (ValueError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError; KeyError/TypeError
                # cover a payload that lacks the expected structure.
                m = None
            if m:
                job_url_direct = m.group(1)

        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            description_full = markdown_converter(description_full)

    return description_full, job_url_direct
|
||||
|
||||
def _get_cookies(self):
|
||||
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||
url = f"{self.api_url}/jobs-app/event"
|
||||
|
|
Loading…
Reference in New Issue