adjusted scraper for better delimiter and comma only between records

pull/268/head
fakebranden 2025-03-12 00:47:10 +00:00
parent cd916c7978
commit e9160a0b4c
2 changed files with 1099 additions and 59 deletions


@@ -1,39 +1,23 @@
-import os
+import csv
 import datetime
+import os
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
+from jobspy.ziprecruiter import ZipRecruiter
 from jobspy.model import ScraperInput
 
-def clean_text(text: str) -> str:
-    """
-    Cleans text for CSV output by removing or replacing characters
-    that could break CSV formatting.
-    """
-    if not text:
-        return ""
-    # Remove commas, newlines, carriage returns and double quotes.
-    cleaned = text.replace(",", " ") \
-        .replace("\n", " ") \
-        .replace("\r", " ") \
-        .replace('"', "'")
-    # Collapse multiple spaces into one.
-    return " ".join(cleaned.split())
 
 # Define job sources
 sources = {
     "google": Google,
     "linkedin": LinkedIn,
     "indeed": Indeed,
+    "zip_recruiter": ZipRecruiter,
 }
 
 # Define search preferences
 search_terms = ["Automation Engineer", "CRM Manager", "Implementation Specialist", "Automation", "CRM"]
 results_wanted = 100  # Fetch more jobs
 max_days_old = 2  # Fetch jobs posted in last 48 hours
 target_state = "NY"  # Only keep jobs from New York
 
 def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
     """Scrape jobs from multiple sources and filter by state."""
@@ -67,18 +51,19 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
             # Exclude jobs that don't explicitly match the search terms
             if not any(term.lower() in job.title.lower() for term in search_terms):
                 print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
-                continue
+                continue  # Skip this job
 
-            # Ensure the job is recent and in NY (or remote)
+            # Ensure the job is recent
             if job.date_posted and (today - job.date_posted).days <= max_days_old:
+                # Only accept jobs if they're in NY or Remote
                 if location_state == target_state or job.is_remote:
                     print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                     all_jobs.append({
                         "Job ID": job.id,
-                        "Job Title (Primary)": clean_text(job.title),
-                        "Company Name": clean_text(job.company_name) if job.company_name else "Unknown",
-                        "Industry": clean_text(job.company_industry) if job.company_industry else "Not Provided",
-                        "Experience Level": clean_text(job.job_level) if job.job_level else "Not Provided",
+                        "Job Title (Primary)": job.title,
+                        "Company Name": job.company_name if job.company_name else "Unknown",
+                        "Industry": job.company_industry if job.company_industry else "Not Provided",
+                        "Experience Level": job.job_level if job.job_level else "Not Provided",
                         "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                         "Is Remote": job.is_remote,
                         "Currency": job.compensation.currency if job.compensation else "",
@@ -89,7 +74,7 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                         "Location State": location_state,
                         "Location Country": location_country,
                         "Job URL": job.job_url,
-                        "Job Description": clean_text(job.description) if job.description else "No description available",
+                        "Job Description": job.description.replace(",", "") if job.description else "No description available",
                         "Job Source": source_name
                     })
                 else:
@@ -100,8 +85,14 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
     print(f"\n{len(all_jobs)} jobs retrieved in NY")
     return all_jobs
 
 def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
-    """Save job data to a CSV file with a custom delimiter."""
+    """Save job data to a CSV file with custom formatting:
+    - Fields within a record are separated by the custom delimiter |~|
+    - Records are separated by a comma
+    - All commas in field values are removed
+    - Blank fields are replaced with 'Not Provided'
+    """
     if not jobs:
         print("⚠️ No jobs found matching criteria.")
         return
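To make the new docstring concrete: the output file becomes a single line, with |~| between fields and one comma between records. A tiny self-contained demonstration with invented values and a shortened field list:

fieldnames = ["Job ID", "Job Title (Primary)", "Job Source"]
jobs = [
    {"Job ID": "101", "Job Title (Primary)": "Automation Engineer", "Job Source": "indeed"},
    {"Job ID": "102", "Job Title (Primary)": "CRM Manager", "Job Source": "linkedin"},
]
records = ["|~|".join(fieldnames)]
records += ["|~|".join(job[f] for f in fieldnames) for job in jobs]
print(",".join(records))
# Job ID|~|Job Title (Primary)|~|Job Source,101|~|Automation Engineer|~|indeed,102|~|CRM Manager|~|linkedin

Stripping commas from every value is what keeps the comma unambiguous as a record separator, though it also mutates fields such as Job URL if a URL ever contains one.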
@@ -118,20 +109,31 @@ def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
         "Job Source"
     ]
 
-    # Define your custom delimiter
-    delimiter = "|~|"
+    # Build header record using custom field delimiter
+    header_record = "|~|".join(fieldnames)
+    records = [header_record]
 
-    with open(filename, mode="w", encoding="utf-8") as file:
-        # Write header
-        file.write(delimiter.join(fieldnames) + "\n")
-        # Write each job record
-        for job in jobs:
-            # Convert all field values to string
-            row = [str(job.get(field, "")) for field in fieldnames]
-            file.write(delimiter.join(row) + "\n")
+    for job in jobs:
+        row = []
+        for field in fieldnames:
+            value = str(job.get(field, "")).strip()
+            if not value:
+                value = "Not Provided"
+            # Remove all commas from the value
+            value = value.replace(",", "")
+            row.append(value)
+        # Join fields with the custom delimiter
+        record = "|~|".join(row)
+        records.append(record)
+
+    # Join records with a comma as the record separator
+    output = ",".join(records)
+
+    with open(filename, "w", encoding="utf-8") as file:
+        file.write(output)
 
     print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
 
 # Run the scraper with multiple job searches
 job_data = scrape_jobs(
     search_terms=search_terms,
@@ -140,5 +142,5 @@ job_data = scrape_jobs(
     target_state=target_state
 )
 
-# Save results to CSV
+# Save results to CSV with custom formatting
 save_jobs_to_csv(job_data)
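Because the output is no longer standard CSV, downstream consumers need a matching reader: split on the comma record separator first, then on |~| for fields. A minimal sketch, assuming the file was produced by the save_jobs_to_csv above (the comma removal in the writer is what makes the first split safe):

def load_jobs(filename="jobspy_output.csv"):
    # Inverse of save_jobs_to_csv: one line, comma-separated records,
    # |~|-separated fields, first record is the header.
    with open(filename, encoding="utf-8") as file:
        records = file.read().split(",")
    header = records[0].split("|~|")
    return [dict(zip(header, record.split("|~|"))) for record in records[1:]]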

File diff suppressed because one or more lines are too long