mirror of https://github.com/Bunsly/JobSpy
revert to non runid file
parent 692ae9ca21
commit cc7c7f0a1d
Changed workflow (name: JobSpy Scraper Dynamic Workflow):

@@ -1,44 +1,63 @@
 name: JobSpy Scraper Dynamic Workflow
 
 on:
   workflow_dispatch:
     inputs:
       user_email:
         description: 'Email of user'
         required: true
+        default: 'Branden@autoemployme.onmicrosoft.com'
 
 permissions:
   contents: read
   id-token: write
 
 jobs:
   scrape_jobs:
     runs-on: ubuntu-latest
 
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v3
 
       - name: Set Up Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.10'
 
-      - name: Sanitize Email
-        id: vars
+      - name: Install Dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Sanitize Email (Preserve Case)
+        id: sanitize
         run: |
           raw_email="${{ github.event.inputs.user_email }}"
           safe_email=$(echo "$raw_email" | sed 's/@/_at_/g; s/\./_/g')
           echo "safe_email=$safe_email" >> $GITHUB_OUTPUT
 
-      - name: Run Job Scraper
-        env:
-          GITHUB_RUN_ID: ${{ github.run_id }}
+      - name: Ensure outputs folder exists
+        run: mkdir -p outputs
+
+      - name: Run Job Scraper with Config
         run: |
           python job_scraper_dynamic.py "${{ github.event.inputs.user_email }}"
 
       - name: Upload Output Artifact
         uses: actions/upload-artifact@v4
         with:
-          name: jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ github.run_id }}
-          path: outputs/jobspy_output_${{ steps.vars.outputs.safe_email }}_${{ github.run_id }}.csv
+          name: jobspy_output_${{ steps.sanitize.outputs.safe_email }}
+          path: outputs/jobspy_output_${{ steps.sanitize.outputs.safe_email }}.csv
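For reference, the sanitize step only rewrites the characters "@" and "." before the value is reused in the artifact name. A minimal Python sketch of the same transformation (the workflow's default email is reused purely as an illustration) shows the file name the later upload step will look for:

# Mirrors the sed expression 's/@/_at_/g; s/\./_/g' from the sanitize step.
# Illustrative only; the workflow itself does this in shell and exposes the
# result as steps.sanitize.outputs.safe_email.
def sanitize(email: str) -> str:
    return email.replace("@", "_at_").replace(".", "_")

safe = sanitize("Branden@autoemployme.onmicrosoft.com")
print(safe)                                 # Branden_at_autoemployme_onmicrosoft_com
print(f"outputs/jobspy_output_{safe}.csv")  # path expected by the artifact upload step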
Changed script (job_scraper_dynamic.py):

@@ -1,51 +1,72 @@
-import csv, datetime, os, sys, json
+import csv
+import datetime
+import os
+import sys
+import json
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
 from jobspy.model import ScraperInput
 
-# Define sources
+# Define job sources
 sources = {
     "google": Google,
     "linkedin": LinkedIn,
     "indeed": Indeed,
 }
 
 
 def sanitize_email(email):
     return email.replace("@", "_at_").replace(".", "_")
 
-def load_config(email):
-    safe_email = sanitize_email(email)
-    config_path = os.path.join("configs", f"config_{safe_email}.json")
-    if not os.path.exists(config_path):
-        raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
-    with open(config_path, "r", encoding="utf-8") as f:
-        return json.load(f), safe_email
+def load_config_file(email=None):
+    if email:
+        safe_email = sanitize_email(email)
+        config_path = os.path.join("configs", f"config_{safe_email}.json")
+        if os.path.exists(config_path):
+            print(f"📂 Loading config for {email} → {config_path}")
+            with open(config_path, "r", encoding="utf-8") as f:
+                return json.load(f), safe_email
+        else:
+            raise FileNotFoundError(f"❌ Config for {email} not found at {config_path}")
+    else:
+        raise ValueError("❌ Email must be passed as argument")
 
-def scrape_jobs(search_terms, results_wanted_str, max_days_old_str, target_state):
-    results_wanted = int(results_wanted_str.strip())
-    max_days_old = int(max_days_old_str.strip())
-    today = datetime.date.today()
-    all_jobs = []
+def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
+    all_jobs = []
+    today = datetime.date.today()
+    print(f"\n🔍 Scraping jobs for: {search_terms}")
 
     for term in search_terms:
-        for source, Scraper in sources.items():
-            print(f"🔍 Scraping {term} from {source}")
-            scraper = Scraper()
+        for source_name, source_class in sources.items():
+            print(f"🚀 Scraping '{term}' from {source_name}...")
+            scraper = source_class()
+            criteria = ScraperInput(site_type=[source_name], search_term=term, results_wanted=results_wanted)
 
             try:
-                jobs = scraper.scrape(ScraperInput(
-                    site_type=[source],
-                    search_term=term,
-                    results_wanted=results_wanted
-                )).jobs
+                response = scraper.scrape(criteria)
             except Exception as e:
-                print(f"⚠️ {source} error: {e}")
+                print(f"❌ Error scraping {source_name}: {e}")
                 continue
 
-            for job in jobs:
+            for job in response.jobs:
+                city = job.location.city.strip() if job.location.city else "Unknown"
+                state = job.location.state.strip().upper() if job.location.state else "Unknown"
+                country = str(job.location.country) if job.location.country else "Unknown"
+
+                if not any(t.lower() in job.title.lower() for t in search_terms):
+                    continue
+
                 if job.date_posted and (today - job.date_posted).days <= max_days_old:
-                    if target_state == (job.location.state or "").upper() or job.is_remote:
-                        if any(term.lower() in job.title.lower() for term in search_terms):
+                    if state == target_state or job.is_remote:
                         all_jobs.append({
                             "Job ID": job.id,
                             "Job Title (Primary)": job.title,
@@ -57,52 +78,73 @@ def scrape_jobs(search_terms, results_wanted_str, max_days_old_str, target_state
                             "Currency": job.compensation.currency if job.compensation else "",
                             "Salary Min": job.compensation.min_amount if job.compensation else "",
                             "Salary Max": job.compensation.max_amount if job.compensation else "",
-                            "Date Posted": job.date_posted.strftime("%Y-%m-%d"),
-                            "Location City": job.location.city or "Unknown",
-                            "Location State": (job.location.state or "Unknown").upper(),
-                            "Location Country": job.location.country or "Unknown",
+                            "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
+                            "Location City": city,
+                            "Location State": state,
+                            "Location Country": country,
                             "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description",
-                            "Job Source": source
+                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                            "Job Source": source_name
                         })
-    print(f"✅ Found {len(all_jobs)} jobs")
+    print(f"✅ {len(all_jobs)} jobs matched.")
     return all_jobs
 
-def save_to_csv(jobs, path):
-    os.makedirs(os.path.dirname(path), exist_ok=True)
+def save_jobs_to_csv(jobs, output_path):
+    if not jobs:
+        print("⚠️ No jobs found.")
+        return
+
     fieldnames = [
         "Job ID", "Job Title (Primary)", "Company Name", "Industry",
         "Experience Level", "Job Type", "Is Remote", "Currency",
         "Salary Min", "Salary Max", "Date Posted", "Location City",
-        "Location State", "Location Country", "Job URL", "Job Description", "Job Source"
+        "Location State", "Location Country", "Job URL", "Job Description",
+        "Job Source"
     ]
-    header = "|~|".join(fieldnames)
-    rows = [header] + ["|~|".join(str(job.get(col, "Not Provided")).replace(",", "").strip() for col in fieldnames) for job in jobs]
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(",".join(rows))
-    print(f"💾 Saved output to: {path}")
+
+    header = "|~|".join(fieldnames)
+    rows = [header]
+
+    for job in jobs:
+        row = []
+        for field in fieldnames:
+            value = str(job.get(field, "Not Provided")).replace(",", "").strip()
+            row.append(value if value else "Not Provided")
+        rows.append("|~|".join(row))
+
+    output = ",".join(rows)
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(output)
+
+    print(f"💾 Saved output to: {output_path}")
 
+# MAIN
 if __name__ == "__main__":
     try:
-        if len(sys.argv) != 2:
-            raise ValueError("❌ Usage: python job_scraper_dynamic.py <user_email>")
-
-        user_email = sys.argv[1]
-        run_id = os.getenv("GITHUB_RUN_ID")
-        if not run_id:
-            raise EnvironmentError("❌ GITHUB_RUN_ID is not set in the environment.")
-
-        config, safe_email = load_config(user_email)
-
-        jobs = scrape_jobs(
-            config["search_terms"],
-            config["results_wanted"],
-            config["max_days_old"],
-            config["target_state"]
-        )
-
-        save_to_csv(jobs, f"outputs/jobspy_output_{safe_email}_{run_id}.csv")
+        user_email = sys.argv[1] if len(sys.argv) >= 2 else None
+        config, safe_email = load_config_file(user_email)
+
+        job_data = scrape_jobs(
+            search_terms=config["search_terms"],
+            results_wanted=config["results_wanted"],
+            max_days_old=config["max_days_old"],
+            target_state=config["target_state"]
+        )
+
+        output_file = f"outputs/jobspy_output_{safe_email}.csv"
+        save_jobs_to_csv(job_data, output_file)
 
     except Exception as e:
-        print(f"❌ Fatal error: {e}")
+        print(f"❌ Fatal Error: {e}")
         sys.exit(1)
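The script resolves all scraping parameters from a per-user file at configs/config_<sanitized email>.json via load_config_file; no config file is shown in this diff. The sketch below is a hypothetical example that uses only the four keys the script actually reads (search_terms, results_wanted, max_days_old, target_state); the values are invented for illustration:

# Hypothetical per-user config, written where load_config_file expects it.
# Key names come from job_scraper_dynamic.py; the values are placeholders.
import json
import os

email = "Branden@autoemployme.onmicrosoft.com"  # default from the workflow input
safe_email = email.replace("@", "_at_").replace(".", "_")

config = {
    "search_terms": ["data engineer", "python developer"],  # assumed example terms
    "results_wanted": 20,                                    # assumed results per source
    "max_days_old": 7,                                       # assumed posting-age cutoff
    "target_state": "TX",                                    # assumed US state filter
}

os.makedirs("configs", exist_ok=True)
with open(os.path.join("configs", f"config_{safe_email}.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

With such a file in place, running python job_scraper_dynamic.py "<email>" (as the workflow does) should produce outputs/jobspy_output_<sanitized email>.csv.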
|
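One behavior worth noting in scrape_jobs (both sides of this diff filter on the title): a job is kept only when at least one search term appears as a contiguous, case-insensitive substring of the job title. A small standalone check of that exact expression, with invented titles:

# Standalone illustration of the title filter used in scrape_jobs.
# Only the any(...) expression is taken from the script; the titles are made up.
search_terms = ["data engineer", "python developer"]

for title in ["Senior Data Engineer", "Python Backend Developer", "Data Analyst"]:
    keep = any(t.lower() in title.lower() for t in search_terms)
    print(f"{title!r}: {'kept' if keep else 'filtered out'}")

"Python Backend Developer" is dropped here because "python developer" is not a contiguous substring of it, which is worth keeping in mind when choosing search_terms in the config.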
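save_jobs_to_csv writes an unusual layout despite the .csv extension: commas are stripped from every field, fields within a record are joined with the "|~|" token, and the header plus all records are then joined with commas onto a single line. A reader sketch, assuming a file produced by the version above (the path is only an example):

# Sketch of reading the "|~|"-delimited output back into dicts.
# Assumes a file written by save_jobs_to_csv; the path is illustrative.
path = "outputs/jobspy_output_Branden_at_autoemployme_onmicrosoft_com.csv"

with open(path, encoding="utf-8") as f:
    records = f.read().split(",")  # records were joined with "," on one line

header = records[0].split("|~|")
jobs = [dict(zip(header, rec.split("|~|"))) for rec in records[1:]]

for job in jobs[:3]:
    print(job["Job Title (Primary)"], job["Job URL"])

The comma stripping inside save_jobs_to_csv is what keeps this split on "," unambiguous.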