removed commas in fields

2026-03-05 20:14:32 -08:00 · 2025-03-12 00:03:02 +00:00
parent 341deba465
commit 25c084ca2c
3 changed files with 75 additions and 252 deletions
--- a/job_scraper_exact_match.py
+++ b/job_scraper_exact_match.py
@@ -1,12 +1,26 @@
-import csv
-import datetime
 import os
+import datetime
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.ziprecruiter import ZipRecruiter
 from jobspy.model import ScraperInput

+def clean_text(text: str) -> str:
+    """
+    Flatten text to one line for delimited output: commas, newlines and carriage
+    returns become spaces, double quotes become single quotes; falsy input gives "".
+    """
+    if not text:
+        return ""
+    # Replace commas, newlines and carriage returns with spaces; swap " for '.
+    cleaned = text.replace(",", " ") \
+                  .replace("\n", " ") \
+                  .replace("\r", " ") \
+                  .replace('"', "'")
+    # split() with no separator splits on whitespace runs, so this also trims both ends.
+    return " ".join(cleaned.split())
+
 # Define job sources
 sources = {
    "google": Google,
@@ -17,7 +31,7 @@ sources = {

 # Define search preferences
 search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist", "Automation", "CRM"]
-results_wanted = 200  # Fetch more jobs
+results_wanted = 100  # Number of job results to request
 max_days_old = 2      # Fetch jobs posted in last 48 hours
 target_state = "NY"   # Only keep jobs from New York

@@ -50,22 +64,21 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                # Debug: Show all jobs being fetched
                print(f"📍 Fetched Job: {job.title} - {location_city}, {location_state}, {location_country}")

-                # 🔥 Exclude jobs that don’t explicitly match the search terms
+                # Exclude jobs that don’t explicitly match the search terms
                if not any(term.lower() in job.title.lower() for term in search_terms):
                    print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
-                    continue  # Skip this job
+                    continue

-                # Ensure the job is recent
+                # Ensure the job is recent and in NY (or remote)
                if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                    # Only accept jobs if they're in NY or Remote
                    if location_state == target_state or job.is_remote:
                        print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                        all_jobs.append({
                            "Job ID": job.id,
-                            "Job Title (Primary)": job.title,
-                            "Company Name": job.company_name if job.company_name else "Unknown",
-                            "Industry": job.company_industry if job.company_industry else "Not Provided",
-                            "Experience Level": job.job_level if job.job_level else "Not Provided",
+                            "Job Title (Primary)": clean_text(job.title),
+                            "Company Name": clean_text(job.company_name) if job.company_name else "Unknown",
+                            "Industry": clean_text(job.company_industry) if job.company_industry else "Not Provided",
+                            "Experience Level": clean_text(job.job_level) if job.job_level else "Not Provided",
                            "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                            "Is Remote": job.is_remote,
                            "Currency": job.compensation.currency if job.compensation else "",
@@ -76,7 +89,7 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                            "Location State": location_state,
                            "Location Country": location_country,
                            "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                            "Job Description": clean_text(job.description) if job.description else "No description available",
                            "Job Source": source_name
                        })
                    else:
@@ -87,9 +100,8 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
    print(f"\n✅ {len(all_jobs)} jobs retrieved in NY")
    return all_jobs

-
 def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
-    """Save job data to a CSV file."""
+    """Save job data to a CSV file with a custom delimiter."""
    if not jobs:
        print("⚠️ No jobs found matching criteria.")
        return
@@ -106,14 +118,20 @@ def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
        "Job Source"
    ]

-    with open(filename, mode="w", newline="", encoding="utf-8") as file:
-        writer = csv.DictWriter(file, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(jobs)
+    # Define your custom delimiter
+    delimiter = "|~|"
+
+    with open(filename, mode="w", encoding="utf-8") as file:
+        # Write header
+        file.write(delimiter.join(fieldnames) + "\n")
+        # Write each job record
+        for job in jobs:
+            # Convert all field values to string
+            row = [str(job.get(field, "")) for field in fieldnames]
+            file.write(delimiter.join(row) + "\n")

    print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")

-
 # Run the scraper with multiple job searches
 job_data = scrape_jobs(
    search_terms=search_terms,
--- a/jobspy/ziprecruiter/init.py
+++ b/jobspy/ziprecruiter/init.py
@@ -9,10 +9,11 @@ from datetime import datetime

 from bs4 import BeautifulSoup

+import cloudscraper  # NEW: Use cloudscraper to bypass Cloudflare
+
 from jobspy.ziprecruiter.constant import headers, get_cookie_data
 from jobspy.util import (
    extract_emails_from_text,
-    create_session,
    markdown_converter,
    remove_attributes,
    create_logger,
@@ -41,15 +42,20 @@ class ZipRecruiter(Scraper):
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
-        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
+        Initializes ZipRecruiterScraper with the ZipRecruiter job search url.
+        This version uses cloudscraper to bypass Cloudflare's anti-bot challenge.
        """
        super().__init__(Site.ZIP_RECRUITER, proxies=proxies)

-        self.scraper_input = None
-        self.session = create_session(proxies=proxies, ca_cert=ca_cert)
+        # Use cloudscraper instead of the standard session to handle Cloudflare.
+        self.session = cloudscraper.create_scraper()
+        if proxies:
+            self.session.proxies = proxies
+
        self.session.headers.update(headers)
        self._get_cookies()

+        self.scraper_input = None
        self.delay = 5
        self.jobs_per_page = 20
        self.seen_urls = set()
@@ -86,10 +92,10 @@ class ZipRecruiter(Scraper):
        self, scraper_input: ScraperInput, continue_token: str | None = None
    ) -> tuple[list[JobPost], str | None]:
        """
-        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
+        Scrapes a page of ZipRecruiter for jobs with scraper_input criteria.
        :param scraper_input:
        :param continue_token:
-        :return: jobs found on page
+        :return: jobs found on page.
        """
        jobs_list = []
        params = add_params(scraper_input)
@@ -123,7 +129,7 @@ class ZipRecruiter(Scraper):

    def _process_job(self, job: dict) -> JobPost | None:
        """
-        Processes an individual job dict from the response
+        Processes an individual job dict from the response.
        """
        title = job.get("name")
        job_url = f"{self.base_url}/jobs//j?lvk={job['listing_key']}"
@@ -184,16 +190,16 @@ class ZipRecruiter(Scraper):
            job_descr_div = soup.find("div", class_="job_description")
            company_descr_section = soup.find("section", class_="company_description")
            job_description_clean = (
-                remove_attributes(job_descr_div).prettify(formatter="html")
+                remove_attributes(job_descr_div).get_text(separator="\n", strip=True)
                if job_descr_div
                else ""
            )
            company_description_clean = (
-                remove_attributes(company_descr_section).prettify(formatter="html")
+                remove_attributes(company_descr_section).get_text(separator="\n", strip=True)
                if company_descr_section
                else ""
            )
-            description_full = job_description_clean + company_description_clean
+            description_full = job_description_clean + "\n" + company_description_clean

            try:
                script_tag = soup.find("script", type="application/json")
--- a/jobspy_output.csv
+++ b/jobspy_output.csv