mirror of https://github.com/Bunsly/JobSpy
remove duplicates - gsheets (#29)
parent 9550886091
commit c4baa79181
@@ -11,8 +11,33 @@ from settings import *


 class CSVFormatter:
+    @staticmethod
+    def fetch_job_urls(credentials: Any) -> set:
+        """
+        Fetches all the job urls from the google sheet to prevent duplicates
+        :param credentials:
+        :return: urls
+        """
+        try:
+            gc = gspread.authorize(credentials)
+            sh = gc.open(GSHEET_NAME)
+
+            worksheet = sh.get_worksheet(0)
+            data = worksheet.get_all_values()
+            job_urls = set()
+            for row in data[1:]:
+                job_urls.add(row[3])
+            return job_urls
+        except Exception as e:
+            raise e
+
     @staticmethod
     def upload_to_google_sheet(csv_data: str):
+        """
+        Appends rows to google sheet
+        :param csv_data:
+        :return:
+        """
         try:
             scope = [
                 "https://www.googleapis.com/auth/spreadsheets",
@@ -29,22 +54,28 @@ class CSVFormatter:
             data_string = csv_data.getvalue()
             reader = csv.reader(StringIO(data_string))

+            job_urls = CSVFormatter.fetch_job_urls(credentials)
+
             rows = list(reader)

             for i, row in enumerate(rows):
                 if i == 0:
                     continue
+                if row[4] in job_urls:
+                    continue
+
+                row[6] = format(int(row[6]), ",d") if row[6] else ""
+                row[7] = format(int(row[7]), ",d") if row[7] else ""
                 worksheet.append_row(row)
         except Exception as e:
             raise e

     @staticmethod
     def generate_filename() -> str:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        return f"JobSpy_results_{timestamp}.csv"
-
-    @staticmethod
-    def generate_filename() -> str:
+        """
+        Adds a timestamp to the filename header
+        :return: filename
+        """
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         return f"JobSpy_results_{timestamp}.csv"

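The duplicate handling above is a two-pass flow: fetch_job_urls loads every job URL already present in the Google Sheet into a set, and upload_to_google_sheet then skips any CSV row whose URL is in that set before appending. A minimal sketch of the same flow, using gspread's service_account() helper in place of the authorize(credentials) call shown in the diff; the key-file name, sheet name, and URL column positions below are illustrative, not taken from the repo:

import csv
from io import StringIO

import gspread

# hypothetical key file; the repo reads GSHEET_NAME and credentials from settings
gc = gspread.service_account(filename="client_secret.json")
worksheet = gc.open("JobSpy").get_worksheet(0)

# pass 1: collect URLs already uploaded (skip the header row)
existing_urls = {row[3] for row in worksheet.get_all_values()[1:]}

# pass 2: append only rows whose URL is not already in the sheet
csv_text = "Title,Company Name,Job URL\nDev,Acme,https://example.com/job/1\n"
for i, row in enumerate(csv.reader(StringIO(csv_text))):
    if i == 0 or row[-1] in existing_urls:
        continue
    worksheet.append_row(row)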
@@ -59,20 +90,17 @@ class CSVFormatter:
         writer = csv.writer(output)

         headers = [
-            "Site",
             "Title",
             "Company Name",
-            "Job URL",
-            "Country",
             "City",
             "State",
             "Job Type",
-            "Compensation Interval",
+            "Pay Cycle",
             "Min Amount",
             "Max Amount",
-            "Currency",
             "Date Posted",
             "Description",
+            "Job URL",
         ]
         writer.writerow(headers)

@@ -81,11 +109,8 @@ class CSVFormatter:
             for job in job_response["jobs"]:
                 writer.writerow(
                     [
-                        site,
                         job["title"],
                         job["company_name"],
-                        job["job_url"],
-                        job["location"]["country"],
                         job["location"]["city"],
                         job["location"]["state"],
                         job["job_type"].value if job.get("job_type") else "",
@@ -98,11 +123,9 @@ class CSVFormatter:
                         job["compensation"]["max_amount"]
                         if job["compensation"]
                         else "",
-                        job["compensation"]["currency"]
-                        if job["compensation"]
-                        else "",
                         job.get("date_posted", ""),
                         job["description"],
+                        job["job_url"],
                     ]
                 )

@@ -1,5 +1,5 @@
 from typing import Union
-from datetime import datetime
+from datetime import date
 from enum import Enum

 from pydantic import BaseModel, validator
@@ -34,9 +34,9 @@ class CompensationInterval(Enum):

 class Compensation(BaseModel):
     interval: CompensationInterval
-    min_amount: float
-    max_amount: float
-    currency: str = "USA"
+    min_amount: int
+    max_amount: int
+    currency: str = "USD"


 class JobPost(BaseModel):
@@ -48,7 +48,8 @@ class JobPost(BaseModel):
     description: str = None
     job_type: JobType = None
     compensation: Compensation = None
-    date_posted: datetime = None
+    # why is 08-28-2023 a validation error for type date? how do I fix this?
+    date_posted: date = None


 class JobResponse(BaseModel):
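The comment added above asks why "08-28-2023" fails validation for a date field: pydantic parses date strings as ISO 8601 (YYYY-MM-DD), so a US-style MM-DD-YYYY string is rejected. One possible fix, sketched here on a stripped-down model rather than code from this commit, is a pre-validator that normalizes the string before pydantic's own parsing runs:

from datetime import date, datetime
from pydantic import BaseModel, validator

class JobPost(BaseModel):
    date_posted: date = None

    @validator("date_posted", pre=True)
    def parse_us_date(cls, v):
        # accept "08-28-2023"; anything else falls through to pydantic's ISO parsing
        if isinstance(v, str):
            try:
                return datetime.strptime(v, "%m-%d-%Y").date()
            except ValueError:
                return v
        return v

print(JobPost(date_posted="08-28-2023").date_posted)  # 2023-08-28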
@@ -1,6 +1,7 @@
 import re
 import json
 from typing import Optional, Tuple, List
+from datetime import datetime

 import tls_client
 import urllib.parse
@@ -14,6 +15,8 @@ from api.core.scrapers import Scraper, ScraperInput, Site, StatusException

 from concurrent.futures import ThreadPoolExecutor, Future
 import math
+import traceback
+import sys


 class ParsingException(Exception):
@@ -69,6 +72,8 @@ class IndeedScraper(Scraper):
             raise StatusException(response.status_code)

         soup = BeautifulSoup(response.content, "html.parser")
+        if "did not match any jobs" in str(soup):
+            raise ParsingException("Search did not match any jobs")

         jobs = IndeedScraper.parse_jobs(
             soup
@@ -84,6 +89,7 @@ class IndeedScraper(Scraper):

         def process_job(job) -> Optional[JobPost]:
             job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
+            job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
             if job_url in self.seen_urls:
                 return None

@@ -102,14 +108,15 @@ class IndeedScraper(Scraper):
             if interval in CompensationInterval.__members__:
                 compensation = Compensation(
                     interval=CompensationInterval[interval],
-                    min_amount=extracted_salary.get("max"),
-                    max_amount=extracted_salary.get("min"),
+                    min_amount=int(extracted_salary.get("max")),
+                    max_amount=int(extracted_salary.get("min")),
                     currency=currency,
                 )

             job_type = IndeedScraper.get_job_type(job)
             timestamp_seconds = job["pubDate"] / 1000
             date_posted = datetime.fromtimestamp(timestamp_seconds)
+            date_posted = date_posted.strftime("%Y-%m-%d")

             description = self.get_description(job_url, session)
             li_elements = snippet_html.find_all("li")
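Indeed's pubDate field is an epoch timestamp in milliseconds, so it is divided by 1000 before datetime.fromtimestamp(), and the new strftime call reduces it to a plain date string. A small illustration with a made-up timestamp (fromtimestamp uses the local timezone, so the exact date can shift by a day across zones):

from datetime import datetime

pub_date_ms = 1693180800000  # hypothetical pubDate value from the Indeed payload
timestamp_seconds = pub_date_ms / 1000
date_posted = datetime.fromtimestamp(timestamp_seconds).strftime("%Y-%m-%d")
print(date_posted)  # e.g. "2023-08-28", depending on the local timezone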
@@ -129,7 +136,7 @@ class IndeedScraper(Scraper):
                 job_type=job_type,
                 compensation=compensation,
                 date_posted=date_posted,
-                job_url=job_url,
+                job_url=job_url_client,
             )
             return job_post

@@ -167,12 +174,12 @@ class IndeedScraper(Scraper):
                 jobs, _ = future.result()

                 job_list += jobs

         except StatusException as e:
             return JobResponse(
                 success=False,
                 error=f"Indeed returned status code {e.status_code}",
             )

         except ParsingException as e:
             return JobResponse(
                 success=False,
@@ -251,6 +258,7 @@ class IndeedScraper(Scraper):
         :return: script_tag
         """
         script_tags = soup.find_all("script")

         for tag in script_tags:
             if (
                 tag.string
@@ -1,4 +1,5 @@
 from typing import Optional
+from datetime import datetime

 import requests
 from bs4 import BeautifulSoup
@@ -1,4 +1,6 @@
+import math
 import json
+from datetime import datetime
 from typing import Optional, Tuple, List
 from urllib.parse import urlparse, parse_qs

@@ -11,7 +13,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
 from api.core.jobs import JobPost
 from api.core.scrapers import Scraper, ScraperInput, Site, StatusException
 from api.core.jobs import *
-import math


 class ZipRecruiterScraper(Scraper):
@@ -173,6 +174,11 @@ class ZipRecruiterScraper(Scraper):
                 success=False,
                 error=f"ZipRecruiter returned status code {e.status_code}",
             )
+        except Exception as e:
+            return JobResponse(
+                success=False,
+                error=f"ZipRecruiter failed to scrape: {e}",
+            )

         #: note: this does not handle if the results are more or less than the results_wanted

@@ -226,7 +232,7 @@ class ZipRecruiterScraper(Scraper):
         return CompensationInterval(interval_str)

     @staticmethod
-    def get_date_posted(job: BeautifulSoup) -> Optional[str]:
+    def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]:
         """
         Extracts the date a job was posted
         :param job
@@ -235,10 +241,21 @@ class ZipRecruiterScraper(Scraper):
         button = job.find(
             "button", {"class": "action_input save_job zrs_btn_secondary_200"}
         )
-        url_time = button["data-href"]
+        if not button:
+            return None
+
+        url_time = button.get("data-href", "")
         url_components = urlparse(url_time)
         params = parse_qs(url_components.query)
-        return params.get("posted_time", [None])[0]
+        posted_time_str = params.get("posted_time", [None])[0]
+
+        if posted_time_str:
+            posted_date = datetime.strptime(
+                posted_time_str, "%Y-%m-%dT%H:%M:%SZ"
+            ).date()
+            return posted_date
+
+        return None

     @staticmethod
     def get_compensation(job: BeautifulSoup) -> Optional[Compensation]:
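get_date_posted now guards against a missing save-job button and converts the posted_time query parameter of its data-href into a datetime.date instead of returning the raw string. The parsing step in isolation, with a made-up data-href value in the shape the scraper expects:

from datetime import datetime
from urllib.parse import urlparse, parse_qs

url_time = "/jobs/save?posted_time=2023-08-28T12:00:00Z"  # hypothetical data-href
params = parse_qs(urlparse(url_time).query)
posted_time_str = params.get("posted_time", [None])[0]
if posted_time_str:
    posted_date = datetime.strptime(posted_time_str, "%Y-%m-%dT%H:%M:%SZ").date()
    print(posted_date)  # 2023-08-28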
@@ -265,9 +282,9 @@ class ZipRecruiterScraper(Scraper):
             amount = amount.replace(",", "").strip("$ ").split(" ")[0]
             if "K" in amount:
                 amount = amount.replace("K", "")
-                amount = float(amount) * 1000
+                amount = int(float(amount)) * 1000
             else:
-                amount = float(amount)
+                amount = int(float(amount))
             amounts.append(amount)

         compensation = Compensation(
@@ -39,9 +39,8 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         scraped_data: JobResponse = scraper.scrape(scraper_input)
         return (site.value, scraped_data)

-    with ThreadPoolExecutor() as executor:
+    with ThreadPoolExecutor(max_workers=3) as executor:
         results = dict(executor.map(scrape_site, scraper_input.site_type))

     scraper_response = CommonResponse(status="JSON response success", **results)

     if scraper_input.output_format == OutputFormat.CSV:
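scrape_site returns a (site name, JobResponse) pair, so mapping it over the requested sites feeds straight into dict(); capping the pool at three workers presumably gives one thread per scraper touched in this diff (Indeed, LinkedIn, ZipRecruiter). The pattern in isolation, with stand-in scrapers and illustrative site names:

from concurrent.futures import ThreadPoolExecutor

def scrape_site(site: str):
    # stand-in for scraper.scrape(scraper_input); returns (site, result)
    return site, f"scraped {site}"

with ThreadPoolExecutor(max_workers=3) as executor:
    results = dict(executor.map(scrape_site, ["indeed", "linkedin", "zip_recruiter"]))

print(results)  # {'indeed': 'scraped indeed', 'linkedin': 'scraped linkedin', ...}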
@@ -56,11 +55,13 @@ async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
         csv_output = CSVFormatter.format(scraper_response)
         try:
             CSVFormatter.upload_to_google_sheet(csv_output)
-            return CommonResponse(status="Successfully uploaded to Google Sheets")
+            return CommonResponse(
+                status="Successfully uploaded to Google Sheets", **results
+            )

         except Exception as e:
             return CommonResponse(
-                status="Failed to upload to Google Sheet", error=repr(e)
+                status="Failed to upload to Google Sheet", error=repr(e), **results
             )

     else: