mirror of https://github.com/Bunsly/JobSpy

adjusted scraper for better delimiter and comma only between records
parent cd916c7978
commit e9160a0b4c
@@ -1,32 +1,16 @@
-import os
+import csv
 import datetime
+import os
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
-from jobspy.ziprecruiter import ZipRecruiter
 from jobspy.model import ScraperInput
 
 
-def clean_text(text: str) -> str:
-    """
-    Cleans text for CSV output by removing or replacing characters
-    that could break CSV formatting.
-    """
-    if not text:
-        return ""
-    # Remove commas, newlines, carriage returns and double quotes.
-    cleaned = text.replace(",", " ") \
-        .replace("\n", " ") \
-        .replace("\r", " ") \
-        .replace('"', "'")
-    # Collapse multiple spaces into one.
-    return " ".join(cleaned.split())
-
-
 # Define job sources
 sources = {
     "google": Google,
     "linkedin": LinkedIn,
     "indeed": Indeed,
-    "zip_recruiter": ZipRecruiter,
 }
 
 # Define search preferences
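
Review note: clean_text is removed here, and its comma-stripping duty moves into save_jobs_to_csv below, which now reserves the comma as the record separator. A standalone sketch of why stray commas in values cannot survive that scheme (sample values invented, not part of the commit):

    # If a value kept its comma, splitting the file on "," would cut the record in two.
    record = "|~|".join(["1234", "Engineer, Data Platform", "ACME"])
    print(record.split(","))  # ['1234|~|Engineer', ' Data Platform|~|ACME'] -> record torn apart
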
@@ -67,18 +51,19 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
             # Exclude jobs that don’t explicitly match the search terms
             if not any(term.lower() in job.title.lower() for term in search_terms):
                 print(f"🚫 Excluding: {job.title} (Doesn't match {search_terms})")
-                continue
+                continue # Skip this job
 
-            # Ensure the job is recent and in NY (or remote)
+            # Ensure the job is recent
             if job.date_posted and (today - job.date_posted).days <= max_days_old:
+                # Only accept jobs if they're in NY or Remote
                 if location_state == target_state or job.is_remote:
                     print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                     all_jobs.append({
                         "Job ID": job.id,
-                        "Job Title (Primary)": clean_text(job.title),
-                        "Company Name": clean_text(job.company_name) if job.company_name else "Unknown",
-                        "Industry": clean_text(job.company_industry) if job.company_industry else "Not Provided",
-                        "Experience Level": clean_text(job.job_level) if job.job_level else "Not Provided",
+                        "Job Title (Primary)": job.title,
+                        "Company Name": job.company_name if job.company_name else "Unknown",
+                        "Industry": job.company_industry if job.company_industry else "Not Provided",
+                        "Experience Level": job.job_level if job.job_level else "Not Provided",
                         "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
                         "Is Remote": job.is_remote,
                         "Currency": job.compensation.currency if job.compensation else "",
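
Side note on the filter retained above: a job is kept only when at least one search term is a case-insensitive substring of its title. The predicate in isolation (terms and titles invented for illustration):

    search_terms = ["data engineer", "analytics"]  # hypothetical terms

    def title_matches(title: str) -> bool:
        # Mirrors the any(...) expression in the diff: case-insensitive substring test
        return any(term.lower() in title.lower() for term in search_terms)

    print(title_matches("Senior Data Engineer"))  # True
    print(title_matches("Account Manager"))       # False
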
@@ -89,7 +74,7 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                         "Location State": location_state,
                         "Location Country": location_country,
                         "Job URL": job.job_url,
-                        "Job Description": clean_text(job.description) if job.description else "No description available",
+                        "Job Description": job.description.replace(",", "") if job.description else "No description available",
                         "Job Source": source_name
                     })
                 else:
@@ -100,8 +85,14 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
     print(f"\n✅ {len(all_jobs)} jobs retrieved in NY")
     return all_jobs
 
 def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
-    """Save job data to a CSV file with a custom delimiter."""
+    """Save job data to a CSV file with custom formatting:
+
+    - Fields within a record are separated by the custom delimiter |~|
+    - Records are separated by a comma
+    - All commas in field values are removed
+    - Blank fields are replaced with 'Not Provided'
+    """
     if not jobs:
         print("⚠️ No jobs found matching criteria.")
         return
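
For reference, per the new docstring the whole file becomes a single comma-joined string: the header record first, then one record per job, with |~| between fields. A small sketch of what that yields (records and field list here are made up and trimmed):

    header = "|~|".join(["Job ID", "Job Title (Primary)", "Company Name"])
    rec1 = "|~|".join(["101", "Data Engineer", "ACME"])
    rec2 = "|~|".join(["102", "ML Engineer", "Globex"])
    print(",".join([header, rec1, rec2]))
    # Job ID|~|Job Title (Primary)|~|Company Name,101|~|Data Engineer|~|ACME,102|~|ML Engineer|~|Globex
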
@@ -118,20 +109,31 @@ def save_jobs_to_csv(jobs, filename="jobspy_output.csv"):
         "Job Source"
     ]
 
-    # Define your custom delimiter
-    delimiter = "|~|"
+    # Build header record using custom field delimiter
+    header_record = "|~|".join(fieldnames)
+    records = [header_record]
 
-    with open(filename, mode="w", encoding="utf-8") as file:
-        # Write header
-        file.write(delimiter.join(fieldnames) + "\n")
-        # Write each job record
-        for job in jobs:
-            # Convert all field values to string
-            row = [str(job.get(field, "")) for field in fieldnames]
-            file.write(delimiter.join(row) + "\n")
+    for job in jobs:
+        row = []
+        for field in fieldnames:
+            value = str(job.get(field, "")).strip()
+            if not value:
+                value = "Not Provided"
+            # Remove all commas from the value
+            value = value.replace(",", "")
+            row.append(value)
+        # Join fields with the custom delimiter
+        record = "|~|".join(row)
+        records.append(record)
+
+    # Join records with a comma as the record separator
+    output = ",".join(records)
+    with open(filename, "w", encoding="utf-8") as file:
+        file.write(output)
 
     print(f"✅ Jobs saved to {filename} ({len(jobs)} entries)")
 
+
 # Run the scraper with multiple job searches
 job_data = scrape_jobs(
     search_terms=search_terms,
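
Worth noting for downstream consumers: despite the .csv extension, standard CSV readers will not parse this layout. A minimal reader sketch, assuming the file was produced by the new save_jobs_to_csv (splitting on ',' is safe only because the writer strips commas from values); load_jobs is an invented name, not part of the repo:

    def load_jobs(filename="jobspy_output.csv"):
        """Inverse of save_jobs_to_csv: records split on ',', fields on '|~|'."""
        with open(filename, encoding="utf-8") as file:
            records = file.read().split(",")
        fieldnames = records[0].split("|~|")  # first record is the header
        return [dict(zip(fieldnames, rec.split("|~|"))) for rec in records[1:]]

    # Usage: jobs = load_jobs(); print(jobs[0]["Job Title (Primary)"])
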
@@ -140,5 +142,5 @@ job_data = scrape_jobs(
     target_state=target_state
 )
 
-# Save results to CSV
+# Save results to CSV with custom formatting
 save_jobs_to_csv(job_data)
jobspy_output.csv (1078 changes): file diff suppressed because one or more lines are too long