mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 20:14:32 -08:00
remove duplicates - gsheets (#29)
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import re
|
||||
import json
|
||||
from typing import Optional, Tuple, List
|
||||
from datetime import datetime
|
||||
|
||||
import tls_client
|
||||
import urllib.parse
|
||||
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
import math
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
|
||||
class ParsingException(Exception):
|
||||
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
|
||||
raise StatusException(response.status_code)
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
if "did not match any jobs" in str(soup):
|
||||
raise ParsingException("Search did not match any jobs")
|
||||
|
||||
jobs = IndeedScraper.parse_jobs(
|
||||
soup
|
||||
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):
|
||||
|
||||
def process_job(job) -> Optional[JobPost]:
|
||||
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
||||
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
||||
if job_url in self.seen_urls:
|
||||
return None
|
||||
|
||||
@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
|
||||
if interval in CompensationInterval.__members__:
|
||||
compensation = Compensation(
|
||||
interval=CompensationInterval[interval],
|
||||
min_amount=extracted_salary.get("max"),
|
||||
max_amount=extracted_salary.get("min"),
|
||||
min_amount=int(extracted_salary.get("max")),
|
||||
max_amount=int(extracted_salary.get("min")),
|
||||
currency=currency,
|
||||
)
|
||||
|
||||
job_type = IndeedScraper.get_job_type(job)
|
||||
timestamp_seconds = job["pubDate"] / 1000
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||
|
||||
description = self.get_description(job_url, session)
|
||||
li_elements = snippet_html.find_all("li")
|
||||
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
|
||||
job_type=job_type,
|
||||
compensation=compensation,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
job_url=job_url_client,
|
||||
)
|
||||
return job_post
|
||||
|
||||
@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
|
||||
jobs, _ = future.result()
|
||||
|
||||
job_list += jobs
|
||||
|
||||
except StatusException as e:
|
||||
return JobResponse(
|
||||
success=False,
|
||||
error=f"Indeed returned status code {e.status_code}",
|
||||
)
|
||||
|
||||
except ParsingException as e:
|
||||
return JobResponse(
|
||||
success=False,
|
||||
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
|
||||
:return: script_tag
|
||||
"""
|
||||
script_tags = soup.find_all("script")
|
||||
|
||||
for tag in script_tags:
|
||||
if (
|
||||
tag.string
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import math
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, List
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
@@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from api.core.jobs import JobPost
|
||||
from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
|
||||
from api.core.jobs import *
|
||||
import math
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
|
||||
success=False,
|
||||
error=f"ZipRecruiter returned status code {e.status_code}",
|
||||
)
|
||||
except Exception as e:
|
||||
return JobResponse(
|
||||
success=False,
|
||||
error=f"ZipRecruiter failed to scrape: {e}",
|
||||
)
|
||||
|
||||
#: note: this does not handle the case where the number of results is more or less than results_wanted
|
||||
|
||||
@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
return CompensationInterval(interval_str)
|
||||
|
||||
@staticmethod
|
||||
def get_date_posted(job: BeautifulSoup) -> Optional[str]:
|
||||
def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
|
||||
"""
|
||||
Extracts the date a job was posted
|
||||
:param job
|
||||
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
|
||||
button = job.find(
|
||||
"button", {"class": "action_input save_job zrs_btn_secondary_200"}
|
||||
)
|
||||
url_time = button["data-href"]
|
||||
if not button:
|
||||
return None
|
||||
|
||||
url_time = button.get("data-href", "")
|
||||
url_components = urlparse(url_time)
|
||||
params = parse_qs(url_components.query)
|
||||
return params.get("posted_time", [None])[0]
|
||||
posted_time_str = params.get("posted_time", [None])[0]
|
||||
|
||||
if posted_time_str:
|
||||
posted_date = datetime.strptime(
|
||||
posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
|
||||
).date()
|
||||
return posted_date
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
|
||||
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
|
||||
amount = amount.replace(",", "").strip("$ ").split(" ")[0]
|
||||
if "K" in amount:
|
||||
amount = amount.replace("K", "")
|
||||
amount = float(amount) * 1000
|
||||
amount = int(float(amount)) * 1000
|
||||
else:
|
||||
amount = float(amount)
|
||||
amount = int(float(amount))
|
||||
amounts.append(amount)
|
||||
|
||||
compensation = Compensation(
|
||||
|
||||
Reference in New Issue
Block a user