removed commas in fields

pull/268/head
fakebranden 2025-03-12 00:03:02 +00:00
parent 341deba465
commit 25c084ca2c
3 changed files with 75 additions and 252 deletions

View File

@@ -1,12 +1,26 @@
+import csv
+import datetime
 import os
-import datetime
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.ziprecruiter import ZipRecruiter
 from jobspy.model import ScraperInput
+
+
+def clean_text(text: str) -> str:
+    """
+    Cleans text for CSV output by removing or replacing characters
+    that could break CSV formatting.
+    """
+    if not text:
+        return ""
+    # Remove commas, newlines, carriage returns and double quotes.
+    cleaned = text.replace(",", " ") \
+        .replace("\n", " ") \
+        .replace("\r", " ") \
+        .replace('"', "'")
+    # Collapse multiple spaces into one.
+    return " ".join(cleaned.split())
 
 # Define job sources
 sources = {
     "google": Google,
@@ -17,7 +31,7 @@ sources = {
 
 # Define search preferences
 search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist", "Automation", "CRM"]
-results_wanted = 200  # Fetch more jobs
+results_wanted = 100  # Fetch more jobs
 max_days_old = 2  # Fetch jobs posted in last 48 hours
 target_state = "NY"  # Only keep jobs from New York
@@ -50,22 +64,21 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
 
             # Debug: Show all jobs being fetched
             print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")
 
-            # 🔥 Exclude jobs that dont explicitly match the search terms
+            # Exclude jobs that dont explicitly match the search terms
             if not any(term.lower() in job.title.lower() for term in search_terms):
                 print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
-                continue  # Skip this job
+                continue
 
-            # Ensure the job is recent
+            # Ensure the job is recent and in NY (or remote)
             if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                # Only accept jobs if they're in NY or Remote
                 if location_state == target_state or job.is_remote:
                     print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                     all_jobs.append({
                         "Job ID": job.id,
-                        "Job Title (Primary)": job.title,
-                        "Company Name": job.company_name if job.company_name else "Unknown",
-                        "Industry": job.company_industry if job.company_industry else "Not Provided",
-                        "Experience Level": job.job_level if job.job_level else "Not Provided",
+                        "Job Title (Primary)": clean_text(job.title),
+                        "Company Name": clean_text(job.company_name) if job.company_name else "Unknown",
+                        "Industry": clean_text(job.company_industry) if job.company_industry else "Not Provided",
+                        "Experience Level": clean_text(job.job_level) if job.job_level else "Not Provided",
                         "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                         "Is Remote": job.is_remote,
                         "Currency": job.compensation.currency if job.compensation else "",
@@ -76,7 +89,7 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                         "Location State": location_state,
                         "Location Country": location_country,
                         "Job URL": job.job_url,
-                        "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                        "Job Description": clean_text(job.description) if job.description else "No description available",
                         "Job Source": source_name
                     })
                 else:
@@ -87,9 +100,8 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
     print(f"\n{len(all_jobs)} jobs retrieved in NY")
     return all_jobs
 
 
 def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
-    """Save job data to a CSV file."""
+    """Save job data to a CSV file with a custom delimiter."""
     if not jobs:
         print("⚠️ No jobs found matching criteria.")
         return
@@ -106,14 +118,20 @@ def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
         "Job Source"
     ]
 
-    with open(filename, mode="w", newline="", encoding="utf-8") as file:
-        writer = csv.DictWriter(file, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(jobs)
+    # Define your custom delimiter
+    delimiter = "|~|"
+
+    with open(filename, mode="w", encoding="utf-8") as file:
+        # Write header
+        file.write(delimiter.join(fieldnames) + "\n")
+        # Write each job record
+        for job in jobs:
+            # Convert all field values to string
+            row = [str(job.get(field, "")) for field in fieldnames]
+            file.write(delimiter.join(row) + "\n")
 
     print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
 
 # Run the scraper with multiple job searches
 job_data = scrape_jobs(
     search_terms=search_terms,
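Because "|~|" is a multi-character delimiter, Python's built-in csv reader will not split on it directly. A minimal sketch of how a consumer could load the output back, assuming pandas is installed and the default file name jobspy_output.csv is used (the column selection is only illustrative):

import pandas as pd

# Multi-character separators are treated as regular expressions and
# require the python parsing engine; the pipes are escaped because
# "|" is a regex metacharacter.
df = pd.read_csv("jobspy_output.csv", sep=r"\|~\|", engine="python")
print(df[["Job Title (Primary)", "Company Name", "Location State"]].head())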

View File

@@ -9,10 +9,11 @@ from datetime import datetime
 
 from bs4 import BeautifulSoup
+import cloudscraper  # NEW: Use cloudscraper to bypass Cloudflare
 
 from jobspy.ziprecruiter.constant import headers, get_cookie_data
 from jobspy.util import (
     extract_emails_from_text,
-    create_session,
     markdown_converter,
     remove_attributes,
     create_logger,
@@ -41,15 +42,20 @@ class ZipRecruiter(Scraper):
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
     ):
         """
-        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
+        Initializes ZipRecruiterScraper with the ZipRecruiter job search url.
+        This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
         """
         super().__init__(Site.ZIP_RECRUITER, proxies=proxies)
-        self.scraper_input = None
-        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
+        # Use cloudscraper instead of the standard session to handle Cloudflare.
+        self.session = cloudscraper.create_scraper()
+        if proxies:
+            self.session.proxies = proxies
         self.session.headers.update(headers)
         self._get_cookies()
+        self.scraper_input = None
         self.delay = 5
         self.jobs_per_page = 20
         self.seen_urls = set()
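For context on this swap: cloudscraper.create_scraper() returns a requests.Session-compatible object that solves Cloudflare's JavaScript challenge on first contact, so the scraper's existing .get() calls keep working unchanged. A standalone sketch of that behaviour (the URL and proxy address are illustrative, not from the diff):

import cloudscraper

# Drop-in replacement for requests.Session; the Cloudflare challenge
# is handled transparently on the first request.
session = cloudscraper.create_scraper()
session.headers.update({"Accept": "text/html"})
# Proxies use the same mapping format as requests.
session.proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}

resp = session.get("https://www.ziprecruiter.com/")  # illustrative request
print(resp.status_code)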
@@ -86,10 +92,10 @@ class ZipRecruiter(Scraper):
         self, scraper_input: ScraperInput, continue_token: str | None = None
     ) -> tuple[list[JobPost], str | None]:
         """
-        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria.
         :param scraper_input:
         :param continue_token:
-        :return: jobs found on page
+        :return: jobs found on page.
         """
         jobs_list = []
         params = add_params(scraper_input)
@@ -123,7 +129,7 @@
 
     def _process_job(self, job: dict) -> JobPost | None:
         """
-        Processes an individual job dict from the response
+        Processes an individual job dict from the response.
         """
         title = job.get("name")
         job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@@ -184,16 +190,16 @@ class ZipRecruiter(Scraper):
         job_descr_div = soup.find("div", class_="job_description")
         company_descr_section = soup.find("section", class_="company_description")
         job_description_clean = (
-            remove_attributes(job_descr_div).prettify(formatter="html")
+            remove_attributes(job_descr_div).get_text(separator="\n", strip=True)
             if job_descr_div
             else ""
         )
         company_description_clean = (
-            remove_attributes(company_descr_section).prettify(formatter="html")
+            remove_attributes(company_descr_section).get_text(separator="\n", strip=True)
             if company_descr_section
             else ""
         )
-        description_full = job_description_clean + company_description_clean
+        description_full = job_description_clean + "\n" + company_description_clean
 
         try:
             script_tag = soup.find("script", type="application/json")
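The effect of swapping prettify(formatter="html") for get_text(separator="\n", strip=True) is that the stored description becomes plain text instead of serialized HTML. A small self-contained comparison (the sample markup is made up for illustration):

from bs4 import BeautifulSoup

html = '<div class="job_description"><p>Build CRM automations.</p><ul><li>HubSpot</li><li>Zapier</li></ul></div>'
div = BeautifulSoup(html, "html.parser").div

# Old behaviour: re-serialized HTML, tags and all.
print(div.prettify(formatter="html"))

# New behaviour: text content only, one line per element, trimmed.
print(div.get_text(separator="\n", strip=True))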

File diff suppressed because one or more lines are too long