diff --git a/.github/workflows/job_scraper_dynamic.yml b/.github/workflows/job_scraper_dynamic.yml
index dd89cae..b479611 100644
--- a/.github/workflows/job_scraper_dynamic.yml
+++ b/.github/workflows/job_scraper_dynamic.yml
@@ -47,11 +47,11 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements.txt

-    - name: Write user-specific config.json
+    - name: Write user config.json
      run: |
        echo "{
          \"user_email\": \"${{ github.event.inputs.user_email }}\",
-          \"search_terms\": [\"${{ github.event.inputs.search_terms }}\"],
+          \"search_terms\": \"${{ github.event.inputs.search_terms }}\",
          \"results_wanted\": ${{ github.event.inputs.results_wanted }},
          \"max_days_old\": ${{ github.event.inputs.max_days_old }},
          \"target_state\": \"${{ github.event.inputs.target_state }}\"
@@ -60,12 +60,23 @@ jobs:
    - name: Run JobSpy Scraper Dynamic
      run: python job_scraper_dynamic.py

-    - name: Upload user-specific CSV as artifact
+    - name: Sanitize email for filename
+      id: sanitize
+      run: |
+        safe_name=$(echo "${{ github.event.inputs.user_email }}" | sed 's/@/_at_/g; s/\./_/g')
+        echo "safe_name=$safe_name" >> "$GITHUB_OUTPUT"
+
+    - name: Verify user-specific CSV exists
+      run: |
+        if [ ! -f "jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv" ]; then
+          echo "āŒ ERROR: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv not found!"
+          exit 1
+        else
+          echo "āœ… Found: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv"
+        fi
+
+    - name: Upload jobspy output
      uses: actions/upload-artifact@v4
      with:
-        name: jobspy-output-${{ github.event.inputs.user_email }}
-        path: |
-          jobspy_output_dynamic_${{ github.event.inputs.user_email }}
-          .replace('@','_at_')
-          .replace('.','_')
-          .csv
+        name: jobspy-output-${{ steps.sanitize.outputs.safe_name }}
+        path: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv
diff --git a/job_scraper_dynamic.py b/job_scraper_dynamic.py
index a4b20e7..e8fa574 100644
--- a/job_scraper_dynamic.py
+++ b/job_scraper_dynamic.py
@@ -2,7 +2,7 @@ import csv
 import datetime
 import json
 import os
-
+import re
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
@@ -15,49 +15,41 @@ sources = {
     "indeed": Indeed,
 }

-# Read dynamic user-specific config.json
-with open("config.json", "r") as f:
-    config = json.load(f)
+# Load user config
+with open("config.json", "r") as file:
+    config = json.load(file)

-search_terms = config.get("search_terms", [])
-results_wanted = config.get("results_wanted", 100)
-max_days_old = config.get("max_days_old", 2)
+user_email = config.get("user_email")
+search_terms = [term.strip() for term in config.get("search_terms", "").split(",")]
+results_wanted = int(config.get("results_wanted", 100))
+max_days_old = int(config.get("max_days_old", 2))
 target_state = config.get("target_state", "NY")
-user_email = config.get("user_email", "unknown@domain.com")

+# Sanitize email for filename
+safe_email = re.sub(r'[@.]', lambda x: '_at_' if x.group() == '@' else '_', user_email)
+output_filename = f"jobspy_output_dynamic_{safe_email}.csv"

-def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
-    """Scrape jobs from multiple sources and filter by state."""
+def scrape_jobs():
     all_jobs = []
     today = datetime.date.today()
-
-    print("\nšŸ”Ž DEBUG: Fetching jobs for search terms:", search_terms)
+
+    print(f"\nšŸ”Ž Fetching jobs for: {search_terms}")

     for search_term in search_terms:
         for source_name, source_class in sources.items():
-            print(f"\nšŸš€ Scraping {search_term} from {source_name}...")
-
+            print(f"šŸš€ Scraping {search_term} from {source_name}...")
             scraper = source_class()
-            search_criteria = ScraperInput(
+            input_params = ScraperInput(
                 site_type=[source_name],
                 search_term=search_term,
                 results_wanted=results_wanted,
             )
+            results = scraper.scrape(input_params)

-            job_response = scraper.scrape(search_criteria)
-
-            for job in job_response.jobs:
-                location_city = job.location.city.strip() if job.location.city else "Unknown"
-                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
-                location_country = str(job.location.country) if job.location.country else "Unknown"
-
-                if not any(term.lower() in job.title.lower() for term in search_terms):
-                    print(f"🚫 Excluding: {job.title} (Doesn't match search terms)")
-                    continue
-
+            for job in results.jobs:
+                location_state = job.location.state.strip().upper() if job.location and job.location.state else "Unknown"
                 if job.date_posted and (today - job.date_posted).days <= max_days_old:
                     if location_state == target_state or job.is_remote:
-                        print(f"āœ… MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                         all_jobs.append({
                             "Job ID": job.id,
                             "Job Title (Primary)": job.title,
@@ -70,61 +62,35 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                             "Salary Min": job.compensation.min_amount if job.compensation else "",
                             "Salary Max": job.compensation.max_amount if job.compensation else "",
                             "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
-                            "Location City": location_city,
+                            "Location City": job.location.city if job.location and job.location.city else "Unknown",
                             "Location State": location_state,
-                            "Location Country": location_country,
+                            "Location Country": str(job.location.country) if job.location and job.location.country else "Unknown",
                             "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                            "Job Description": job.description.replace(",", "") if job.description else "No description",
                             "Job Source": source_name
                         })
-                    else:
-                        print(f"āŒ Ignored (Wrong State): {job.title} - {location_city}, {location_state}")
-                else:
-                    print(f"ā³ Ignored (Too Old): {job.title} - {location_city}, {location_state}")

-    print(f"\nāœ… {len(all_jobs)} jobs retrieved for user {user_email}")
     return all_jobs

-
-def save_jobs_to_csv(jobs, user_email):
-    """Save job data to a user-specific CSV file using custom delimiter."""
+def save_jobs_to_csv(jobs, filename):
     if not jobs:
-        print("āš ļø No jobs found matching criteria.")
+        print("āš ļø No jobs found.")
         return

-    # Clean the email to create a safe filename
-    safe_email = user_email.replace("@", "_at_").replace(".", "_")
-    filename = f"jobspy_output_dynamic_{safe_email}.csv"
+    fieldnames = list(jobs[0].keys())
+    header = "|~|".join(fieldnames)
+    records = [header]

-    # Remove old file if it exists
-    if os.path.exists(filename):
-        os.remove(filename)
+    for job in jobs:
+        row = [str(job.get(field, "Not Provided")).replace(",", "") for field in fieldnames]
+        records.append("|~|".join(row))

-    fieldnames = [
-        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
-        "Experience Level", "Job Type", "Is Remote", "Currency",
-        "Salary Min", "Salary Max", "Date Posted", "Location City",
-        "Location State", "Location Country", "Job URL", "Job Description",
-        "Job Source", "User Email"
-    ]
+    output = ",".join(records)
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(output)

-    with open(filename, mode="w", newline="", encoding="utf-8") as file:
mode="w", newline="", encoding="utf-8") as file: - writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter="|") - writer.writeheader() - for job in jobs: - job["User Email"] = user_email - writer.writerow(job) + print(f"āœ… Saved {len(jobs)} jobs to {filename}") - print(f"šŸ“„ File saved: {filename} ({len(jobs)} entries)") - return filename - - -# Run the scraper and save the results to a user-specific output file -job_data = scrape_jobs( - search_terms=search_terms, - results_wanted=results_wanted, - max_days_old=max_days_old, - target_state=target_state -) - -output_filename = save_jobs_to_csv(job_data, user_email) +# Run +scraped_jobs = scrape_jobs() +save_jobs_to_csv(scraped_jobs, output_filename) diff --git a/jobspy_output_dynamic.csv b/jobspy_output_dynamic.csv deleted file mode 100644 index e69de29..0000000