From 77cc1f8550358896f3e15cbde0f4a66de1778b2c Mon Sep 17 00:00:00 2001
From: fakebranden
Date: Tue, 15 Apr 2025 09:01:33 +0000
Subject: [PATCH] Update output artifact naming to include run ID

---
 .github/workflows/job_scraper_dynamic.yml |  20 ++-
 job_scraper_dynamic.py                    | 147 ++++++++--------
 2 files changed, 64 insertions(+), 103 deletions(-)

diff --git a/.github/workflows/job_scraper_dynamic.yml b/.github/workflows/job_scraper_dynamic.yml
index 136c238..b9fbf98 100644
--- a/.github/workflows/job_scraper_dynamic.yml
+++ b/.github/workflows/job_scraper_dynamic.yml
@@ -30,22 +30,20 @@ jobs:
         pip install --upgrade pip
         pip install -r requirements.txt
 
-    - name: Sanitize Email (Preserve Case)
-      id: sanitize
+    - name: Sanitize Email + Create Run ID
+      id: vars
       run: |
-        raw_email="${{ github.event.inputs.user_email }}"
-        safe_email=$(echo "$raw_email" | sed 's/@/_at_/g; s/\./_/g')
+        safe_email=$(echo "${{ github.event.inputs.user_email }}" | sed 's/@/_at_/g; s/\./_/g')
+        run_id=$(date +%s)
         echo "safe_email=$safe_email" >> $GITHUB_OUTPUT
+        echo "run_id=$run_id" >> $GITHUB_OUTPUT
 
-    - name: Ensure outputs folder exists
-      run: mkdir -p outputs
-
-    - name: Run Job Scraper with Config
+    - name: Run Job Scraper
       run: |
-        python job_scraper_dynamic.py "${{ github.event.inputs.user_email }}"
+        python job_scraper_dynamic.py "${{ github.event.inputs.user_email }}" "${{ steps.vars.outputs.run_id }}"
 
     - name: Upload Output Artifact
       uses: actions/upload-artifact@v4
       with:
-        name: jobspy_output_${{ steps.sanitize.outputs.safe_email }}
-        path: outputs/jobspy_output_${{ steps.sanitize.outputs.safe_email }}.csv
+        name: jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}
+        path: outputs/jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}.csv

diff --git a/job_scraper_dynamic.py b/job_scraper_dynamic.py
index 8af2213..5342ebb 100644
--- a/job_scraper_dynamic.py
+++ b/job_scraper_dynamic.py
@@ -1,14 +1,9 @@
-import csv
-import datetime
-import os
-import sys
-import json
+import csv, datetime, os, sys, json
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.model import ScraperInput
 
-# Define job sources
 sources = {
     "google": Google,
     "linkedin": LinkedIn,
@@ -18,114 +13,82 @@ sources = {
 def sanitize_email(email):
     return email.replace("@", "_at_").replace(".", "_")
 
-def load_config_file(email=None):
-    if email:
-        safe_email = sanitize_email(email)
-        config_path = os.path.join("configs", f"config_{safe_email}.json")
-        if os.path.exists(config_path):
-            print(f"šŸ“‚ Loading config for {email} → {config_path}")
-            with open(config_path, "r", encoding="utf-8") as f:
-                return json.load(f), safe_email
-        else:
-            raise FileNotFoundError(f"āŒ Config for {email} not found at {config_path}")
-    else:
-        raise ValueError("āŒ Email must be passed as argument")
+def load_config(email):
+    safe_email = sanitize_email(email)
+    config_path = os.path.join("configs", f"config_{safe_email}.json")
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"āŒ Config for {email} not found at {config_path}")
+    with open(config_path, "r", encoding="utf-8") as f:
+        return json.load(f), safe_email
 
 def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
-    all_jobs = []
     today = datetime.date.today()
-    print(f"\nšŸ” Scraping jobs for: {search_terms}")
+    all_jobs = []
 
     for term in search_terms:
-        for source_name, source_class in sources.items():
-            print(f"šŸš€ Scraping '{term}' from {source_name}...")
-            scraper = source_class()
-            criteria = ScraperInput(site_type=[source_name], search_term=term, results_wanted=results_wanted)
-
+        for source, Scraper in sources.items():
+            print(f"šŸ” Scraping {term} from {source}")
+            scraper = Scraper()
             try:
-                response = scraper.scrape(criteria)
+                jobs = scraper.scrape(ScraperInput(
+                    site_type=[source],
+                    search_term=term,
+                    results_wanted=results_wanted
+                )).jobs
             except Exception as e:
-                print(f"āŒ Error scraping {source_name}: {e}")
+                print(f"āš ļø {source} error: {e}")
                 continue
 
-            for job in response.jobs:
-                city = job.location.city.strip() if job.location.city else "Unknown"
-                state = job.location.state.strip().upper() if job.location.state else "Unknown"
-                country = str(job.location.country) if job.location.country else "Unknown"
-
-                if not any(t.lower() in job.title.lower() for t in search_terms):
-                    continue
-
+            for job in jobs:
                 if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                    if state == target_state or job.is_remote:
-                        all_jobs.append({
-                            "Job ID": job.id,
-                            "Job Title (Primary)": job.title,
-                            "Company Name": job.company_name or "Unknown",
-                            "Industry": job.company_industry or "Not Provided",
-                            "Experience Level": job.job_level or "Not Provided",
-                            "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
-                            "Is Remote": job.is_remote,
-                            "Currency": job.compensation.currency if job.compensation else "",
-                            "Salary Min": job.compensation.min_amount if job.compensation else "",
-                            "Salary Max": job.compensation.max_amount if job.compensation else "",
-                            "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
-                            "Location City": city,
-                            "Location State": state,
-                            "Location Country": country,
-                            "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
-                            "Job Source": source_name
-                        })
+                    if target_state == (job.location.state or "").upper() or job.is_remote:
+                        if any(term.lower() in job.title.lower() for term in search_terms):
+                            all_jobs.append({
+                                "Job ID": job.id,
+                                "Job Title (Primary)": job.title,
+                                "Company Name": job.company_name or "Unknown",
+                                "Industry": job.company_industry or "Not Provided",
+                                "Experience Level": job.job_level or "Not Provided",
+                                "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
+                                "Is Remote": job.is_remote,
+                                "Currency": job.compensation.currency if job.compensation else "",
+                                "Salary Min": job.compensation.min_amount if job.compensation else "",
+                                "Salary Max": job.compensation.max_amount if job.compensation else "",
+                                "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
+                                "Location City": job.location.city or "Unknown",
+                                "Location State": (job.location.state or "Unknown").upper(),
+                                "Location Country": job.location.country or "Unknown",
+                                "Job URL": job.job_url,
+                                "Job Description": job.description.replace(",", "") if job.description else "No description",
+                                "Job Source": source
+                            })
 
-    print(f"āœ… {len(all_jobs)} jobs matched.")
+    print(f"āœ… Found {len(all_jobs)} jobs")
     return all_jobs
 
-def save_jobs_to_csv(jobs, output_path):
-    if not jobs:
-        print("āš ļø No jobs found.")
-        return
-
+def save_to_csv(jobs, path):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
     fieldnames = [
         "Job ID", "Job Title (Primary)", "Company Name", "Industry",
         "Experience Level", "Job Type", "Is Remote", "Currency",
         "Salary Min", "Salary Max", "Date Posted", "Location City",
-        "Location State", "Location Country", "Job URL", "Job Description",
-        "Job Source"
+        "Location State", "Location Country", "Job URL", "Job Description", "Job Source"
Country", "Job URL", "Job Description", "Job Source" ] - header = "|~|".join(fieldnames) - rows = [header] + rows = [header] + ["|~|".join(str(job.get(col, "Not Provided")).replace(",", "").strip() for col in fieldnames) for job in jobs] + with open(path, "w", encoding="utf-8") as f: + f.write(",".join(rows)) + print(f"šŸ’¾ Saved output to: {path}") - for job in jobs: - row = [] - for field in fieldnames: - value = str(job.get(field, "Not Provided")).replace(",", "").strip() - row.append(value if value else "Not Provided") - rows.append("|~|".join(row)) - - output = ",".join(rows) - os.makedirs(os.path.dirname(output_path), exist_ok=True) - with open(output_path, "w", encoding="utf-8") as f: - f.write(output) - - print(f"šŸ’¾ Saved output to: {output_path}") - -# MAIN if __name__ == "__main__": try: - user_email = sys.argv[1] if len(sys.argv) >= 2 else None - config, safe_email = load_config_file(user_email) + if len(sys.argv) != 3: + raise ValueError("āŒ Usage: python job_scraper_dynamic.py ") - job_data = scrape_jobs( - search_terms=config["search_terms"], - results_wanted=config["results_wanted"], - max_days_old=config["max_days_old"], - target_state=config["target_state"] - ) - - output_file = f"outputs/jobspy_output_{safe_email}.csv" - save_jobs_to_csv(job_data, output_file) + user_email, run_id = sys.argv[1], sys.argv[2] + config, safe_email = load_config(user_email) + jobs = scrape_jobs(config["search_terms"], config["results_wanted"], config["max_days_old"], config["target_state"]) + save_to_csv(jobs, f"outputs/jobspy_output_{safe_email}_{run_id}.csv") except Exception as e: - print(f"āŒ Fatal Error: {e}") + print(f"āŒ Fatal error: {e}") sys.exit(1)