removed the code parse the data to csv

2024-12-24 11:15:07 +02:00 · 2024-12-24 11:15:07 +02:00 · 54022f2b57
parent 79189f49ef
commit 54022f2b57
1 changed files with 0 additions and 132 deletions
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@ -149,135 +149,3 @@ def scrape_jobs(
                logger.error(f"Future Error occurred: {e}")
    return merged_jobs
    def convert_to_annual(job_data: dict):
        if job_data["interval"] == "hourly":
            job_data["min_amount"] *= 2080
            job_data["max_amount"] *= 2080
        if job_data["interval"] == "monthly":
            job_data["min_amount"] *= 12
            job_data["max_amount"] *= 12
        if job_data["interval"] == "weekly":
            job_data["min_amount"] *= 52
            job_data["max_amount"] *= 52
        if job_data["interval"] == "daily":
            job_data["min_amount"] *= 260
            job_data["max_amount"] *= 260
        job_data["interval"] = "yearly"
    jobs_dfs: list[pd.DataFrame] = []
    for site, job_response in site_to_jobs_dict.items():
        for job in job_response.jobs:
            job_data = job.dict()
            job_url = job_data["job_url"]
            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
            job_data["site"] = site
            job_data["company"] = job_data["company_name"]
            job_data["job_type"] = (
                ", ".join(job_type.value[0]
                          for job_type in job_data["job_type"])
                if job_data["job_type"]
                else None
            )
            job_data["emails"] = (
                ", ".join(job_data["emails"]) if job_data["emails"] else None
            )
            if job_data["location"]:
                job_data["location"] = Location(
                    **job_data["location"]
                ).display_location()
            compensation_obj = job_data.get("compensation")
            if compensation_obj and isinstance(compensation_obj, dict):
                job_data["interval"] = (
                    compensation_obj.get("interval").value
                    if compensation_obj.get("interval")
                    else None
                )
                job_data["min_amount"] = compensation_obj.get("min_amount")
                job_data["max_amount"] = compensation_obj.get("max_amount")
                job_data["currency"] = compensation_obj.get("currency", "USD")
                job_data["salary_source"] = SalarySource.DIRECT_DATA.value
                if enforce_annual_salary and (
                    job_data["interval"]
                    and job_data["interval"] != "yearly"
                    and job_data["min_amount"]
                    and job_data["max_amount"]
                ):
                    convert_to_annual(job_data)
            else:
                if country_enum == Country.USA:
                    (
                        job_data["interval"],
                        job_data["min_amount"],
                        job_data["max_amount"],
                        job_data["currency"],
                    ) = extract_salary(
                        job_data["description"],
                        enforce_annual_salary=enforce_annual_salary,
                    )
                    job_data["salary_source"] = SalarySource.DESCRIPTION.value
            job_data["salary_source"] = (
                job_data["salary_source"]
                if "min_amount" in job_data and job_data["min_amount"]
                else None
            )
            job_df = pd.DataFrame([job_data])
            jobs_dfs.append(job_df)
    if jobs_dfs:
        # Step 1: Filter out all-NA columns from each DataFrame before concatenation
        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
        # Step 2: Concatenate the filtered DataFrames
        jobs_df = pd.concat(filtered_dfs, ignore_index=True)
        # Desired column order
        desired_order = [
            "id",
            "site",
            "job_url_hyper" if hyperlinks else "job_url",
            "job_url_direct",
            "title",
            "company",
            "location",
            "date_posted",
            "job_type",
            "salary_source",
            "interval",
            "min_amount",
            "max_amount",
            "currency",
            "is_remote",
            "job_level",
            "job_function",
            "listing_type",
            "emails",
            "description",
            "company_industry",
            "company_url",
            "company_logo",
            "company_url_direct",
            "company_addresses",
            "company_num_employees",
            "company_revenue",
            "company_description",
        ]
        # Step 3: Ensure all desired columns are present, adding missing ones as empty
        for column in desired_order:
            if column not in jobs_df.columns:
                jobs_df[column] = None  # Add missing columns as empty
        # Reorder the DataFrame according to the desired order
        jobs_df = jobs_df[desired_order]
        # Step 4: Sort the DataFrame as required
        return jobs_df.sort_values(
            by=["site", "date_posted"], ascending=[True, False]
        ).reset_index(drop=True)
    else:
        return pd.DataFrame()