proj structure

2026-03-04 19:44:30 -08:00 · 2023-09-03 11:10:48 -05:00
parent 39f675f3c8
commit 94c7adeec3
11 changed files with 3 additions and 0 deletions
--- a/jobspy/init.py
+++ b/jobspy/init.py
@@ -0,0 +1,121 @@
+import pandas as pd
+from typing import List, Dict, Tuple, Union
+
+from concurrent.futures import ThreadPoolExecutor
+
+from .core.jobs import JobType
+from .core.scrapers.indeed import IndeedScraper
+from .core.scrapers.ziprecruiter import ZipRecruiterScraper
+from .core.scrapers.linkedin import LinkedInScraper
+from .core.scrapers import (
+    ScraperInput,
+    Site,
+    JobResponse,
+    CommonResponse,
+)
+
+
+SCRAPER_MAPPING = {
+    Site.LINKEDIN: LinkedInScraper,
+    Site.INDEED: IndeedScraper,
+    Site.ZIP_RECRUITER: ZipRecruiterScraper,
+}
+
+
+def _map_str_to_site(site_name: str) -> Site:
+    return Site[site_name.upper()]
+
+
+def scrape_jobs(
+        site_name: str | Site | List[Site],
+        search_term: str,
+
+        location: str = "",
+        distance: int = None,
+        is_remote: bool = False,
+        job_type: JobType = None,
+        easy_apply: bool = False,  # linkedin
+        results_wanted: int = 15
+) -> pd.DataFrame:
+    """
+    Asynchronously scrapes job data from multiple job sites.
+    :return: results_wanted: pandas dataframe containing job data
+    """
+
+    if type(site_name) == str:
+        site_name = _map_str_to_site(site_name)
+
+    site_type = [site_name] if type(site_name) == Site else site_name
+    scraper_input = ScraperInput(
+        site_type=site_type,
+        search_term=search_term,
+        location=location,
+        distance=distance,
+        is_remote=is_remote,
+        job_type=job_type,
+        easy_apply=easy_apply,
+        results_wanted=results_wanted,
+    )
+
+    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
+        scraper_class = SCRAPER_MAPPING[site]
+        scraper = scraper_class()
+        scraped_data: JobResponse = scraper.scrape(scraper_input)
+
+        return site.value, scraped_data
+
+    results = {}
+    for site in scraper_input.site_type:
+        site_value, scraped_data = scrape_site(site)
+        results[site_value] = scraped_data
+
+    dfs = []
+
+    for site, job_response in results.items():
+        for job in job_response.jobs:
+            data = job.dict()
+            data['site'] = site
+
+            # Formatting JobType
+            data['job_type'] = data['job_type'].value if data['job_type'] else None
+
+            # Formatting Location
+            location_obj = data.get('location')
+            if location_obj and isinstance(location_obj, dict):
+                data['city'] = location_obj.get('city', '')
+                data['state'] = location_obj.get('state', '')
+                data['country'] = location_obj.get('country', 'USA')
+            else:
+                data['city'] = None
+                data['state'] = None
+                data['country'] = None
+
+            # Formatting Compensation
+            compensation_obj = data.get('compensation')
+            if compensation_obj and isinstance(compensation_obj, dict):
+                data['interval'] = compensation_obj.get('interval').value if compensation_obj.get('interval') else None
+                data['min_amount'] = compensation_obj.get('min_amount')
+                data['max_amount'] = compensation_obj.get('max_amount')
+                data['currency'] = compensation_obj.get('currency', 'USD')
+            else:
+                data['interval'] = None
+                data['min_amount'] = None
+                data['max_amount'] = None
+                data['currency'] = None
+
+            job_df = pd.DataFrame([data])
+            dfs.append(job_df)
+
+    if dfs:
+        df = pd.concat(dfs, ignore_index=True)
+        desired_order = ['site', 'title', 'company_name', 'city', 'state','job_type',
+                         'interval', 'min_amount', 'max_amount',  'job_url', 'description',]
+        df = df[desired_order]
+    else:
+        df = pd.DataFrame()
+
+    return df
+
+
+
+