mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 03:54:31 -08:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ccb0c17660 | ||
|
|
df339610fa | ||
|
|
c501006bd8 |
@@ -76,7 +76,7 @@ Optional
|
|||||||
├── job_type (str):
|
├── job_type (str):
|
||||||
| fulltime, parttime, internship, contract
|
| fulltime, parttime, internship, contract
|
||||||
│
|
│
|
||||||
├── proxies ():
|
├── proxies (list):
|
||||||
| in format ['user:pass@host:port', 'localhost']
|
| in format ['user:pass@host:port', 'localhost']
|
||||||
| each job board will round robin through the proxies
|
| each job board will round robin through the proxies
|
||||||
│
|
│
|
||||||
@@ -140,13 +140,14 @@ JobPost
|
|||||||
│ ├── state (str)
|
│ ├── state (str)
|
||||||
├── description (str)
|
├── description (str)
|
||||||
├── job_type (str): fulltime, parttime, internship, contract
|
├── job_type (str): fulltime, parttime, internship, contract
|
||||||
|
├── job_function (str)
|
||||||
├── compensation (object)
|
├── compensation (object)
|
||||||
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
||||||
│ ├── min_amount (int)
|
│ ├── min_amount (int)
|
||||||
│ ├── max_amount (int)
|
│ ├── max_amount (int)
|
||||||
│ └── currency (enum)
|
│ └── currency (enum)
|
||||||
└── date_posted (date)
|
├── date_posted (date)
|
||||||
└── emails (str)
|
├── emails (str)
|
||||||
└── is_remote (bool)
|
└── is_remote (bool)
|
||||||
|
|
||||||
Indeed specific
|
Indeed specific
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.55"
|
version = "1.1.56"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from urllib.parse import urlparse, urlunparse, unquote
|
|||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ..utils import create_session
|
from ..utils import create_session, remove_attributes
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Location,
|
Location,
|
||||||
@@ -257,12 +257,6 @@ class LinkedInScraper(Scraper):
|
|||||||
)
|
)
|
||||||
description = None
|
description = None
|
||||||
if div_content is not None:
|
if div_content is not None:
|
||||||
|
|
||||||
def remove_attributes(tag):
|
|
||||||
for attr in list(tag.attrs):
|
|
||||||
del tag[attr]
|
|
||||||
return tag
|
|
||||||
|
|
||||||
div_content = remove_attributes(div_content)
|
div_content = remove_attributes(div_content)
|
||||||
description = div_content.prettify(formatter="html")
|
description = div_content.prettify(formatter="html")
|
||||||
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
|
|||||||
else:
|
else:
|
||||||
self.proxies = {}
|
self.proxies = {}
|
||||||
response = tls_client.Session.execute_request(self, *args, **kwargs)
|
response = tls_client.Session.execute_request(self, *args, **kwargs)
|
||||||
|
response.ok = response.status_code in range(200, 400)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
@@ -178,3 +179,9 @@ def currency_parser(cur_str):
|
|||||||
num = float(cur_str)
|
num = float(cur_str)
|
||||||
|
|
||||||
return np.round(num, 2)
|
return np.round(num, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_attributes(tag):
    """
    Strip every attribute from the given tag, in place.

    Only the attributes of ``tag`` itself are removed; nothing here touches
    nested elements.

    :param tag: object exposing an ``attrs`` mapping and supporting
        ``del tag[name]`` (presumably a BeautifulSoup ``Tag`` - confirm at
        call sites), or None
    :return: the same ``tag`` object with its attributes removed, or None
        when ``tag`` is None
    """
    # Callers pass the result of a lookup that may find nothing; let a
    # missing tag pass through instead of failing on ``None.attrs``.
    if tag is None:
        return None

    # Iterate over a snapshot of the names: deleting while iterating the
    # live mapping would raise "dictionary changed size during iteration".
    for attr in list(tag.attrs):
        del tag[attr]
    return tag
|
||||||
|
|||||||
@@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import math
|
import math
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, Tuple, Any
|
from typing import Optional, Tuple, Any
|
||||||
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
logger,
|
logger,
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
create_session,
|
create_session,
|
||||||
markdown_converter,
|
markdown_converter,
|
||||||
|
remove_attributes,
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
@@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
|
||||||
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
|
||||||
comp_currency = job.get("compensation_currency")
|
comp_currency = job.get("compensation_currency")
|
||||||
|
description_full, job_url_direct = self._get_descr(job_url)
|
||||||
|
|
||||||
return JobPost(
|
return JobPost(
|
||||||
id=str(job["listing_key"]),
|
id=str(job["listing_key"]),
|
||||||
title=title,
|
title=title,
|
||||||
@@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
|
|||||||
),
|
),
|
||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
description=description,
|
description=description_full if description_full else description,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
|
job_url_direct=job_url_direct,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _get_descr(self, job_url):
    """
    Fetches the job page and extracts the full description and the direct job url

    :param job_url: url of the job page to request
    :return: tuple ``(description_full, job_url_direct)``. Both are None when the
        request is not successful. ``job_url_direct`` is None whenever it cannot
        be extracted from the JSON embedded in the page.
    """
    res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
    if not res.ok:
        # Nothing to parse; also avoids handing None to markdown_converter.
        return None, None

    soup = BeautifulSoup(res.text, "html.parser")
    job_descr_div = soup.find("div", class_="job_description")
    company_descr_section = soup.find("section", class_="company_description")

    # Each part contributes an empty string when its element is missing, so
    # the concatenation below is always a str.
    job_description_clean = (
        remove_attributes(job_descr_div).prettify(formatter="html")
        if job_descr_div is not None
        else ""
    )
    company_description_clean = (
        remove_attributes(company_descr_section).prettify(formatter="html")
        if company_descr_section is not None
        else ""
    )
    description_full = job_description_clean + company_description_clean

    # The direct url is optional extra data: an empty script tag, invalid JSON
    # or an unexpected payload shape must not discard the description.
    job_url_direct = None
    script_tag = soup.find("script", type="application/json")
    if script_tag is not None:
        try:
            job_json = json.loads(script_tag.string)
            job_url_val = job_json["model"]["saveJobURL"]
            m = re.search(r"job_url=(.+)", job_url_val)
        except (ValueError, TypeError, KeyError) as e:
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # script tag without a single string child or a non-dict payload.
            logger.debug("could not extract direct job url from %s: %s", job_url, e)
        else:
            if m:
                job_url_direct = m.group(1)

    if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
        description_full = markdown_converter(description_full)

    return description_full, job_url_direct
|
||||||
|
|
||||||
def _get_cookies(self):
|
def _get_cookies(self):
|
||||||
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||||
url = f"{self.api_url}/jobs-app/event"
|
url = f"{self.api_url}/jobs-app/event"
|
||||||
|
|||||||
Reference in New Issue
Block a user