update for artifact with run ID

pull/268/head
fakebranden 2025-04-15 09:01:33 +00:00
parent 84b4524c43
commit 77cc1f8550
2 changed files with 64 additions and 103 deletions

GitHub Actions workflow:

@@ -30,22 +30,20 @@ jobs:
           pip install --upgrade pip
           pip install -r requirements.txt
-      - name: Sanitize Email (Preserve Case)
-        id: sanitize
+      - name: Sanitize Email + Create Run ID
+        id: vars
         run: |
-          raw_email="${{ github.event.inputs.user_email }}"
-          safe_email=$(echo "$raw_email" | sed 's/@/_at_/g; s/\./_/g')
+          safe_email=$(echo "${{ github.event.inputs.user_email }}" | sed 's/@/_at_/g; s/\./_/g')
+          run_id=$(date +%s)
           echo "safe_email=$safe_email" >> $GITHUB_OUTPUT
+          echo "run_id=$run_id" >> $GITHUB_OUTPUT
       - name: Ensure outputs folder exists
         run: mkdir -p outputs
-      - name: Run Job Scraper with Config
+      - name: Run Job Scraper
        run: |
-          python job_scraper_dynamic.py "${{ github.event.inputs.user_email }}"
+          python job_scraper_dynamic.py "${{ github.event.inputs.user_email }}" "${{ steps.vars.outputs.run_id }}"
       - name: Upload Output Artifact
         uses: actions/upload-artifact@v4
         with:
-          name: jobspy_output_${{ steps.sanitize.outputs.safe_email }}
-          path: outputs/jobspy_output_${{ steps.sanitize.outputs.safe_email }}.csv
+          name: jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}
+          path: outputs/jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ steps.vars.outputs.run_id }}.csv
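
Taken together, the new step derives a filesystem-safe artifact name from the user's email plus an epoch-seconds run ID, so repeated runs for the same address no longer collide. A minimal Python sketch of the same naming rule (the substitution order mirrors the sed expression above; the email is illustrative):

import time

def artifact_name(email: str) -> str:
    # '@' becomes '_at_' first, then every '.' becomes '_',
    # mirroring sed 's/@/_at_/g; s/\./_/g' in the workflow step.
    safe_email = email.replace("@", "_at_").replace(".", "_")
    run_id = int(time.time())  # same idea as date +%s in the workflow
    return f"jobspy_output_{safe_email}_{run_id}"

print(artifact_name("jane.doe@example.com"))
# -> jobspy_output_jane_doe_at_example_com_<epoch seconds>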

job_scraper_dynamic.py:

@@ -1,14 +1,9 @@
-import csv
-import datetime
-import os
-import sys
-import json
+import csv, datetime, os, sys, json
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.model import ScraperInput
-# Define job sources
 sources = {
     "google": Google,
     "linkedin": LinkedIn,
@@ -18,114 +13,82 @@ sources = {
 def sanitize_email(email):
     return email.replace("@", "_at_").replace(".", "_")
-def load_config_file(email=None):
-    if email:
-        safe_email = sanitize_email(email)
-        config_path = os.path.join("configs", f"config_{safe_email}.json")
-        if os.path.exists(config_path):
-            print(f"📂 Loading config for {email}: {config_path}")
-            with open(config_path, "r", encoding="utf-8") as f:
-                return json.load(f), safe_email
-        else:
-            raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
-    else:
-        raise ValueError("❌ Email must be passed as argument")
+def load_config(email):
+    safe_email = sanitize_email(email)
+    config_path = os.path.join("configs", f"config_{safe_email}.json")
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
+    with open(config_path, "r", encoding="utf-8") as f:
+        return json.load(f), safe_email
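
The replacement load_config flattens the old nested branches into guard clauses and drops the no-email path, since the caller now validates argv itself. For reference, the file it loads lives at configs/config_<safe_email>.json and needs the four keys the main block consumes; a sketch that writes one (values and email are illustrative):

import json
import os

# Illustrative config; the keys match what __main__ passes to scrape_jobs.
config = {
    "search_terms": ["data engineer", "python developer"],
    "results_wanted": 50,
    "max_days_old": 7,
    "target_state": "NY",
}
os.makedirs("configs", exist_ok=True)
with open("configs/config_jane_doe_at_example_com.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)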
 def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
+    all_jobs = []
     today = datetime.date.today()
-    print(f"\n🔍 Scraping jobs for: {search_terms}")
-    all_jobs = []
     for term in search_terms:
-        for source_name, source_class in sources.items():
-            print(f"🚀 Scraping '{term}' from {source_name}...")
-            scraper = source_class()
-            criteria = ScraperInput(site_type=[source_name], search_term=term, results_wanted=results_wanted)
+        for source, Scraper in sources.items():
+            print(f"🔍 Scraping {term} from {source}")
+            scraper = Scraper()
             try:
-                response = scraper.scrape(criteria)
+                jobs = scraper.scrape(ScraperInput(
+                    site_type=[source],
+                    search_term=term,
+                    results_wanted=results_wanted
+                )).jobs
             except Exception as e:
-                print(f"❌ Error scraping {source_name}: {e}")
+                print(f"⚠️ {source} error: {e}")
                 continue
-            for job in response.jobs:
-                city = job.location.city.strip() if job.location.city else "Unknown"
-                state = job.location.state.strip().upper() if job.location.state else "Unknown"
-                country = str(job.location.country) if job.location.country else "Unknown"
-                if not any(t.lower() in job.title.lower() for t in search_terms):
-                    continue
+            for job in jobs:
                 if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                    if state == target_state or job.is_remote:
-                        all_jobs.append({
-                            "Job ID": job.id,
-                            "Job Title (Primary)": job.title,
-                            "Company Name": job.company_name or "Unknown",
-                            "Industry": job.company_industry or "Not Provided",
-                            "Experience Level": job.job_level or "Not Provided",
-                            "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
-                            "Is Remote": job.is_remote,
-                            "Currency": job.compensation.currency if job.compensation else "",
-                            "Salary Min": job.compensation.min_amount if job.compensation else "",
-                            "Salary Max": job.compensation.max_amount if job.compensation else "",
-                            "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
-                            "Location City": city,
-                            "Location State": state,
-                            "Location Country": country,
-                            "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
-                            "Job Source": source_name
-                        })
-    print(f"✅ {len(all_jobs)} jobs matched.")
+                    if target_state == (job.location.state or "").upper() or job.is_remote:
+                        if any(term.lower() in job.title.lower() for term in search_terms):
+                            all_jobs.append({
+                                "Job ID": job.id,
+                                "Job Title (Primary)": job.title,
+                                "Company Name": job.company_name or "Unknown",
+                                "Industry": job.company_industry or "Not Provided",
+                                "Experience Level": job.job_level or "Not Provided",
+                                "Job Type": job.job_type[0].name if job.job_type else "Not Provided",
+                                "Is Remote": job.is_remote,
+                                "Currency": job.compensation.currency if job.compensation else "",
+                                "Salary Min": job.compensation.min_amount if job.compensation else "",
+                                "Salary Max": job.compensation.max_amount if job.compensation else "",
+                                "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
+                                "Location City": job.location.city or "Unknown",
+                                "Location State": (job.location.state or "Unknown").upper(),
+                                "Location Country": job.location.country or "Unknown",
+                                "Job URL": job.job_url,
+                                "Job Description": job.description.replace(",", "") if job.description else "No description",
+                                "Job Source": source
+                            })
+    print(f"✅ Found {len(all_jobs)} jobs")
     return all_jobs
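
The rewrite folds the old city/state/country pre-computation into the appended dict and reorders the filters: date window first, then state-or-remote, then the title-keyword match. Dropping the old "if job.date_posted else Not Provided" guard on Date Posted is safe because the date filter already guarantees date_posted is set. As a standalone restatement (a hypothetical helper, not part of the commit), the new filter chain is equivalent to:

import datetime

def matches(job, search_terms, max_days_old, target_state, today=None):
    # Hypothetical restatement of the nested conditions in the new scrape_jobs.
    today = today or datetime.date.today()
    if not (job.date_posted and (today - job.date_posted).days <= max_days_old):
        return False
    if not (target_state == (job.location.state or "").upper() or job.is_remote):
        return False
    return any(term.lower() in job.title.lower() for term in search_terms)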
-def save_jobs_to_csv(jobs, output_path):
-    if not jobs:
-        print("⚠️ No jobs found.")
-        return
+def save_to_csv(jobs, path):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
     fieldnames = [
         "Job ID", "Job Title (Primary)", "Company Name", "Industry",
         "Experience Level", "Job Type", "Is Remote", "Currency",
         "Salary Min", "Salary Max", "Date Posted", "Location City",
-        "Location State", "Location Country", "Job URL", "Job Description",
-        "Job Source"
+        "Location State", "Location Country", "Job URL", "Job Description", "Job Source"
     ]
     header = "|~|".join(fieldnames)
-    rows = [header]
-    for job in jobs:
-        row = []
-        for field in fieldnames:
-            value = str(job.get(field, "Not Provided")).replace(",", "").strip()
-            row.append(value if value else "Not Provided")
-        rows.append("|~|".join(row))
-    output = ",".join(rows)
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w", encoding="utf-8") as f:
-        f.write(output)
-    print(f"💾 Saved output to: {output_path}")
+    rows = [header] + ["|~|".join(str(job.get(col, "Not Provided")).replace(",", "").strip() for col in fieldnames) for job in jobs]
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(",".join(rows))
+    print(f"💾 Saved output to: {path}")
 # MAIN
 if __name__ == "__main__":
     try:
-        user_email = sys.argv[1] if len(sys.argv) >= 2 else None
-        config, safe_email = load_config_file(user_email)
-        job_data = scrape_jobs(
-            search_terms=config["search_terms"],
-            results_wanted=config["results_wanted"],
-            max_days_old=config["max_days_old"],
-            target_state=config["target_state"]
-        )
-        output_file = f"outputs/jobspy_output_{safe_email}.csv"
-        save_jobs_to_csv(job_data, output_file)
+        if len(sys.argv) != 3:
+            raise ValueError("❌ Usage: python job_scraper_dynamic.py <user_email> <run_id>")
+        user_email, run_id = sys.argv[1], sys.argv[2]
+        config, safe_email = load_config(user_email)
+        jobs = scrape_jobs(config["search_terms"], config["results_wanted"], config["max_days_old"], config["target_state"])
+        save_to_csv(jobs, f"outputs/jobspy_output_{safe_email}_{run_id}.csv")
     except Exception as e:
-        print(f"❌ Fatal Error: {e}")
+        print(f"❌ Fatal error: {e}")
         sys.exit(1)
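
End to end, the workflow now invokes the script with two positional arguments, and the script enforces that contract with the argv check above. A local dry run might look like this (assuming a matching config already exists under configs/; the email is illustrative):

import subprocess
import time

# Mirrors the "Run Job Scraper" workflow step.
subprocess.run(
    ["python", "job_scraper_dynamic.py", "jane.doe@example.com", str(int(time.time()))],
    check=True,
)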