mirror of https://github.com/Bunsly/JobSpy
removed commas in fields
parent 341deba465
commit 25c084ca2c
@@ -1,12 +1,26 @@
-import csv
-import datetime
 import os
+import datetime
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.ziprecruiter import ZipRecruiter
 from jobspy.model import ScraperInput
 
+def clean_text(text: str) -> str:
+    """
+    Cleans text for CSV output by removing or replacing characters
+    that could break CSV formatting.
+    """
+    if not text:
+        return ""
+    # Remove commas, newlines, carriage returns and double quotes.
+    cleaned = text.replace(",", " ") \
+        .replace("\n", " ") \
+        .replace("\r", " ") \
+        .replace('"', "'")
+    # Collapse multiple spaces into one.
+    return " ".join(cleaned.split())
+
 # Define job sources
 sources = {
     "google": Google,
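Note on clean_text(): it fixes broken CSV rows by rewriting the data itself, so commas and double quotes are lost for good. A minimal alternative sketch, assuming downstream tools accept standard CSV quoting, keeps the original text by letting the stdlib csv module escape those characters instead (the rows value here is illustrative, not from this commit):

import csv

rows = [{"Job Title (Primary)": 'Automation Engineer, "CRM"', "Company Name": "Acme, Inc."}]
with open("jobs_quoted.csv", "w", newline="", encoding="utf-8") as f:
    # QUOTE_ALL wraps every field in quotes and doubles embedded quotes,
    # so commas and newlines inside fields no longer break the format.
    writer = csv.DictWriter(f, fieldnames=list(rows[0]), quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(rows)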
@@ -17,7 +31,7 @@ sources = {
 
 # Define search preferences
 search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist", "Automation", "CRM"]
-results_wanted = 200 # Fetch more jobs
+results_wanted = 100 # Fetch more jobs
 max_days_old = 2 # Fetch jobs posted in last 48 hours
 target_state = "NY" # Only keep jobs from New York
 
@@ -50,22 +64,21 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
             # Debug: Show all jobs being fetched
             print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")
 
-            # 🔥 Exclude jobs that don’t explicitly match the search terms
+            # Exclude jobs that don’t explicitly match the search terms
             if not any(term.lower() in job.title.lower() for term in search_terms):
                 print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
-                continue # Skip this job
+                continue
 
-            # Ensure the job is recent
+            # Ensure the job is recent and in NY (or remote)
             if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                # Only accept jobs if they're in NY or Remote
                 if location_state == target_state or job.is_remote:
                     print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                     all_jobs.append({
                         "Job ID": job.id,
-                        "Job Title (Primary)": job.title,
-                        "Company Name": job.company_name if job.company_name else "Unknown",
-                        "Industry": job.company_industry if job.company_industry else "Not Provided",
-                        "Experience Level": job.job_level if job.job_level else "Not Provided",
+                        "Job Title (Primary)": clean_text(job.title),
+                        "Company Name": clean_text(job.company_name) if job.company_name else "Unknown",
+                        "Industry": clean_text(job.company_industry) if job.company_industry else "Not Provided",
+                        "Experience Level": clean_text(job.job_level) if job.job_level else "Not Provided",
                         "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                         "Is Remote": job.is_remote,
                         "Currency": job.compensation.currency if job.compensation else "",
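The title filter above is a plain substring check, so a short term like "CRM" also admits "CRM Manager" and "Senior CRM Analyst". A small sketch of the same predicate, pulled out so it can be tested in isolation (the helper name is illustrative, not from this commit):

def title_matches(title: str, terms: list[str]) -> bool:
    # Case-insensitive substring match, identical to the inline check above.
    return any(term.lower() in title.lower() for term in terms)

assert title_matches("Senior CRM Manager", ["CRM"])
assert not title_matches("Data Analyst", ["CRM", "Automation"])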
@@ -76,7 +89,7 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                         "Location State": location_state,
                         "Location Country": location_country,
                         "Job URL": job.job_url,
-                        "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                        "Job Description": clean_text(job.description) if job.description else "No description available",
                         "Job Source": source_name
                     })
                 else:
@@ -87,9 +100,8 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
     print(f"\n✅ {len(all_jobs)} jobs retrieved in NY")
     return all_jobs
 
-
 def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
-    """Save job data to a CSV file."""
+    """Save job data to a CSV file with a custom delimiter."""
     if not jobs:
         print("⚠️ No jobs found matching criteria.")
         return
@@ -106,14 +118,20 @@ def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
         "Job Source"
     ]
 
-    with open(filename, mode="w", newline="", encoding="utf-8") as file:
-        writer = csv.DictWriter(file, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(jobs)
+    # Define your custom delimiter
+    delimiter = "|~|"
+
+    with open(filename, mode="w", encoding="utf-8") as file:
+        # Write header
+        file.write(delimiter.join(fieldnames) + "\n")
+        # Write each job record
+        for job in jobs:
+            # Convert all field values to string
+            row = [str(job.get(field, "")) for field in fieldnames]
+            file.write(delimiter.join(row) + "\n")
 
     print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
 
 
 # Run the scraper with multiple job searches
 job_data = scrape_jobs(
     search_terms=search_terms,
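One caveat with the "|~|" format: unlike quoted CSV it has no escaping, so a description that happens to contain "|~|" would still split into extra columns, and clean_text() strips commas but leaves pipes alone. For reading the file back, a sketch assuming pandas is installed; with engine="python", a multi-character sep is treated as a regular expression, hence the escaped pipes:

import pandas as pd

# Hypothetical read-back of the scraper's output file.
df = pd.read_csv("jobspy_output.csv", sep=r"\|~\|", engine="python")
print(df.columns.tolist())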
@@ -9,10 +9,11 @@ from datetime import datetime
 
 from bs4 import BeautifulSoup
 
+import cloudscraper # NEW: Use cloudscraper to bypass Cloudflare
+
 from jobspy.ziprecruiter.constant import headers, get_cookie_data
 from jobspy.util import (
     extract_emails_from_text,
-    create_session,
     markdown_converter,
     remove_attributes,
     create_logger,
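For context, cloudscraper exposes a requests-compatible session that transparently solves Cloudflare's JavaScript challenge. A minimal sketch of the API this import brings in (the URL is illustrative):

import cloudscraper

scraper = cloudscraper.create_scraper()  # behaves like a requests.Session()
resp = scraper.get("https://www.ziprecruiter.com")
print(resp.status_code)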
@@ -41,15 +42,20 @@ class ZipRecruiter(Scraper):
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):
         """
-        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
+        Initializes ZipRecruiterScraper with the ZipRecruiter job search url.
+        This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
         """
         super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
 
-        self.scraper_input = None
-        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
+        # Use cloudscraper instead of the standard session to handle Cloudflare.
+        self.session = cloudscraper.create_scraper()
+        if proxies:
+            self.session.proxies = proxies
+
         self.session.headers.update(headers)
         self._get_cookies()
 
+        self.scraper_input = None
         self.delay = 5
         self.jobs_per_page = 20
         self.seen_urls = set()
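One thing to watch in the new __init__: requests-style sessions (cloudscraper subclasses requests.Session) expect .proxies to be a scheme-to-URL dict, while this constructor accepts list[str] | str | None, so assigning a list straight through may not route requests as intended. A sketch of a normalizing shim, assuming the first proxy should serve both schemes (the helper name is illustrative):

import cloudscraper

def make_scraper(proxies: list[str] | str | None = None):
    session = cloudscraper.create_scraper()
    if proxies:
        # requests expects {"http": url, "https": url}, not a bare list.
        proxy = proxies[0] if isinstance(proxies, list) else proxies
        session.proxies = {"http": proxy, "https": proxy}
    return session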
@@ -86,10 +92,10 @@ class ZipRecruiter(Scraper):
         self, scraper_input: ScraperInput, continue_token: str | None = None
     ) -> tuple[list[JobPost], str | None]:
         """
-        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria.
         :param scraper_input:
         :param continue_token:
-        :return: jobs found on page
+        :return: jobs found on page.
         """
         jobs_list = []
         params = add_params(scraper_input)
@@ -123,7 +129,7 @@ class ZipRecruiter(Scraper):
 
     def _process_job(self, job: dict) -> JobPost | None:
         """
-        Processes an individual job dict from the response
+        Processes an individual job dict from the response.
         """
         title = job.get("name")
         job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@@ -184,16 +190,16 @@ class ZipRecruiter(Scraper):
         job_descr_div = soup.find("div", class_="job_description")
         company_descr_section = soup.find("section", class_="company_description")
         job_description_clean = (
-            remove_attributes(job_descr_div).prettify(formatter="html")
+            remove_attributes(job_descr_div).get_text(separator="\n", strip=True)
             if job_descr_div
             else ""
         )
         company_description_clean = (
-            remove_attributes(company_descr_section).prettify(formatter="html")
+            remove_attributes(company_descr_section).get_text(separator="\n", strip=True)
             if company_descr_section
             else ""
         )
-        description_full = job_description_clean + company_description_clean
+        description_full = job_description_clean + "\n" + company_description_clean
 
         try:
             script_tag = soup.find("script", type="application/json")
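The switch from prettify(formatter="html") to get_text(separator="\n", strip=True) changes what flows downstream: prettify() re-serializes the node as HTML markup, while get_text() keeps only the visible text. A quick illustration with a made-up fragment:

from bs4 import BeautifulSoup

html = '<div class="job_description"><p>Build <b>CRM</b> flows</p><p>Remote, NY</p></div>'
div = BeautifulSoup(html, "html.parser").find("div", class_="job_description")

print(div.prettify(formatter="html"))            # HTML markup, tags preserved
print(div.get_text(separator="\n", strip=True))  # visible text only, tags dropped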
File diff suppressed because one or more lines are too long