dynamic yml and py update

pull/268/head
fakebranden 2025-04-14 21:37:07 +00:00
parent 0a5c5fa9b3
commit 6a326b7dd4
3 changed files with 57 additions and 80 deletions

Workflow file (GitHub Actions .yml):

@@ -47,11 +47,11 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-      - name: Write user-specific config.json
+      - name: Write user config.json
         run: |
           echo "{
             \"user_email\": \"${{ github.event.inputs.user_email }}\",
-            \"search_terms\": [\"${{ github.event.inputs.search_terms }}\"],
+            \"search_terms\": \"${{ github.event.inputs.search_terms }}\",
             \"results_wanted\": ${{ github.event.inputs.results_wanted }},
             \"max_days_old\": ${{ github.event.inputs.max_days_old }},
             \"target_state\": \"${{ github.event.inputs.target_state }}\"
@@ -60,12 +60,23 @@ jobs:
       - name: Run JobSpy Scraper Dynamic
         run: python job_scraper_dynamic.py
-      - name: Upload user-specific CSV as artifact
+      - name: Sanitize email for filename
+        id: sanitize
+        run: |
+          safe_name=$(echo "${{ github.event.inputs.user_email }}" | sed 's/@/_at_/g; s/\./_/g')
+          echo "::set-output name=safe_name::$safe_name"
+      - name: Verify user-specific CSV exists
+        run: |
+          if [ ! -f "jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv" ]; then
+            echo "❌ ERROR: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv not found!"
+            exit 1
+          else
+            echo "✅ Found: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv"
+          fi
+      - name: Upload jobspy output
         uses: actions/upload-artifact@v4
         with:
-          name: jobspy-output-${{ github.event.inputs.user_email }}
-          path: |
-            jobspy_output_dynamic_${{ github.event.inputs.user_email }}
-            .replace('@','_at_')
-            .replace('.','_')
-            .csv
+          name: jobspy-output-${{ steps.sanitize.outputs.safe_name }}
+          path: jobspy_output_dynamic_${{ steps.sanitize.outputs.safe_name }}.csv
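For reference, the "Write user config.json" step now emits search_terms as a single comma-separated string (the Python script splits it), and the sanitize step derives a filesystem-safe name from the e-mail with sed before the verify and upload steps reuse it (published via ::set-output; newer runners prefer appending to "$GITHUB_OUTPUT", but the output name is the same). A minimal sketch of the expected mapping, using hypothetical input values (jane.doe@example.com and the search terms below are illustrations, not repo defaults):

# Sketch only: mirrors the config.json written by the workflow and the
# sed expression 's/@/_at_/g; s/\./_/g' used by the sanitize step.
import json
import re

user_email = "jane.doe@example.com"  # hypothetical example value

config = {
    "user_email": user_email,
    "search_terms": "data engineer, python developer",  # single comma-separated string
    "results_wanted": 50,
    "max_days_old": 2,
    "target_state": "NY",
}
print(json.dumps(config, indent=2))

# Same substitution the sanitize step performs in bash.
safe_name = re.sub(r"\.", "_", user_email.replace("@", "_at_"))
assert safe_name == "jane_doe_at_example_com"
print(f"jobspy_output_dynamic_{safe_name}.csv")  # filename the verify/upload steps expect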

job_scraper_dynamic.py:

@@ -2,7 +2,7 @@ import csv
 import datetime
 import json
 import os
+import re
 from jobspy.google import Google
 from jobspy.linkedin import LinkedIn
 from jobspy.indeed import Indeed
@@ -15,49 +15,41 @@ sources = {
     "indeed": Indeed,
 }
-# Read dynamic user-specific config.json
-with open("config.json", "r") as f:
-    config = json.load(f)
-search_terms = config.get("search_terms", [])
-results_wanted = config.get("results_wanted", 100)
-max_days_old = config.get("max_days_old", 2)
+# Load user config
+with open("config.json", "r") as file:
+    config = json.load(file)
+user_email = config.get("user_email")
+search_terms = [term.strip() for term in config.get("search_terms", "").split(",")]
+results_wanted = int(config.get("results_wanted", 100))
+max_days_old = int(config.get("max_days_old", 2))
 target_state = config.get("target_state", "NY")
-user_email = config.get("user_email", "unknown@domain.com")
-
-def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
-    """Scrape jobs from multiple sources and filter by state."""
+
+# Sanitize email for filename
+safe_email = re.sub(r'[@.]', lambda x: '_at_' if x.group() == '@' else '_', user_email)
+output_filename = f"jobspy_output_dynamic_{safe_email}.csv"
+
+def scrape_jobs():
     all_jobs = []
     today = datetime.date.today()
-    print("\n🔎 DEBUG: Fetching jobs for search terms:", search_terms)
+    print(f"\n🔎 Fetching jobs for: {search_terms}")
     for search_term in search_terms:
         for source_name, source_class in sources.items():
-            print(f"\n🚀 Scraping {search_term} from {source_name}...")
+            print(f"🚀 Scraping {search_term} from {source_name}...")
             scraper = source_class()
-            search_criteria = ScraperInput(
+            input_params = ScraperInput(
                 site_type=[source_name],
                 search_term=search_term,
                 results_wanted=results_wanted,
             )
-            job_response = scraper.scrape(search_criteria)
-            for job in job_response.jobs:
-                location_city = job.location.city.strip() if job.location.city else "Unknown"
-                location_state = job.location.state.strip().upper() if job.location.state else "Unknown"
-                location_country = str(job.location.country) if job.location.country else "Unknown"
-                if not any(term.lower() in job.title.lower() for term in search_terms):
-                    print(f"🚫 Excluding: {job.title} (Doesn't match search terms)")
-                    continue
+            results = scraper.scrape(input_params)
+            for job in results.jobs:
+                location_state = job.location.state.strip().upper() if job.location and job.location.state else "Unknown"
                 if job.date_posted and (today - job.date_posted).days <= max_days_old:
                     if location_state == target_state or job.is_remote:
-                        print(f"✅ MATCH: {job.title} - {location_city}, {location_state} (Posted {job.date_posted})")
                         all_jobs.append({
                             "Job ID": job.id,
                             "Job Title (Primary)": job.title,
@@ -70,61 +62,35 @@ def scrape_jobs(search_terms, results_wanted, max_days_old, target_state):
                             "Salary Min": job.compensation.min_amount if job.compensation else "",
                             "Salary Max": job.compensation.max_amount if job.compensation else "",
                             "Date Posted": job.date_posted.strftime("%Y-%m-%d") if job.date_posted else "Not Provided",
-                            "Location City": location_city,
+                            "Location City": job.location.city if job.location and job.location.city else "Unknown",
                             "Location State": location_state,
-                            "Location Country": location_country,
+                            "Location Country": str(job.location.country) if job.location and job.location.country else "Unknown",
                             "Job URL": job.job_url,
-                            "Job Description": job.description.replace(",", "") if job.description else "No description available",
+                            "Job Description": job.description.replace(",", "") if job.description else "No description",
                             "Job Source": source_name
                         })
-                    else:
-                        print(f"❌ Ignored (Wrong State): {job.title} - {location_city}, {location_state}")
-                else:
-                    print(f"⏳ Ignored (Too Old): {job.title} - {location_city}, {location_state}")
-    print(f"\n{len(all_jobs)} jobs retrieved for user {user_email}")
     return all_jobs

-def save_jobs_to_csv(jobs, user_email):
-    """Save job data to a user-specific CSV file using custom delimiter."""
+def save_jobs_to_csv(jobs, filename):
     if not jobs:
-        print("⚠️ No jobs found matching criteria.")
+        print("⚠️ No jobs found.")
         return
-    # Clean the email to create a safe filename
-    safe_email = user_email.replace("@", "_at_").replace(".", "_")
-    filename = f"jobspy_output_dynamic_{safe_email}.csv"
-    # Remove old file if it exists
-    if os.path.exists(filename):
-        os.remove(filename)
-    fieldnames = [
-        "Job ID", "Job Title (Primary)", "Company Name", "Industry",
-        "Experience Level", "Job Type", "Is Remote", "Currency",
-        "Salary Min", "Salary Max", "Date Posted", "Location City",
-        "Location State", "Location Country", "Job URL", "Job Description",
-        "Job Source", "User Email"
-    ]
-    with open(filename, mode="w", newline="", encoding="utf-8") as file:
-        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter="|")
-        writer.writeheader()
-        for job in jobs:
-            job["User Email"] = user_email
-            writer.writerow(job)
-    print(f"📄 File saved: {filename} ({len(jobs)} entries)")
-    return filename
-
-# Run the scraper and save the results to a user-specific output file
-job_data = scrape_jobs(
-    search_terms=search_terms,
-    results_wanted=results_wanted,
-    max_days_old=max_days_old,
-    target_state=target_state
-)
-output_filename = save_jobs_to_csv(job_data, user_email)
+    fieldnames = list(jobs[0].keys())
+    header = "|~|".join(fieldnames)
+    records = [header]
+    for job in jobs:
+        row = [str(job.get(field, "Not Provided")).replace(",", "") for field in fieldnames]
+        records.append("|~|".join(row))
+    output = ",".join(records)
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(output)
+    print(f"✅ Saved {len(jobs)} jobs to {filename}")
+
+# Run
+scraped_jobs = scrape_jobs()
+save_jobs_to_csv(scraped_jobs, output_filename)
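The rewritten save_jobs_to_csv drops csv.DictWriter: commas are stripped from every field, fields inside a record are joined with "|~|", and the records (header first) are joined with ",", so the whole file is effectively one long line. A minimal sketch of reading that format back, assuming a file produced by the function above (parse_jobspy_output is a hypothetical helper, not part of this commit):

# Sketch: parse the "|~|"-delimited, comma-separated output written by save_jobs_to_csv.
def parse_jobspy_output(path):
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    records = content.split(",")           # records were joined with ","
    fieldnames = records[0].split("|~|")   # first record is the header row
    rows = []
    for record in records[1:]:
        rows.append(dict(zip(fieldnames, record.split("|~|"))))
    return rows

# Hypothetical usage:
# jobs = parse_jobspy_output("jobspy_output_dynamic_jane_doe_at_example_com.csv")
# print(len(jobs), "rows")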