enh: linkedin company logo (#141)

pull/146/head
Cullen Watson 2024-04-30 12:03:10 -05:00 committed by GitHub
parent 8dd08ed9fd
commit bf73c061bd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 1072 additions and 969 deletions

View File

@@ -27,4 +27,4 @@ print("outputted to jobs.csv")
# jobs.to_xlsx('jobs.xlsx', index=False)
# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
# display(jobs)

View File

@@ -32,17 +32,18 @@ while len(all_jobs) < results_wanted:
search_term="software engineer",
# New York, NY
# Dallas, TX
# Los Angeles, CA
location="Los Angeles, CA",
results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
results_wanted=min(
results_in_each_iteration, results_wanted - len(all_jobs)
),
country_indeed="USA",
offset=offset,
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)
# Add the scraped jobs to the list
all_jobs.extend(jobs.to_dict('records'))
all_jobs.extend(jobs.to_dict("records"))
# Increment the offset for the next page of results
offset += results_in_each_iteration

1987
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -25,8 +25,8 @@ regex = "^2024.4.28"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.1"
jupyter = "^1.0.0"
black = "^24.2.0"
pre-commit = "^3.6.2"
black = "*"
pre-commit = "*"
[build-system]
requires = ["poetry-core"]

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from ..jobs import (
Enum,
BaseModel,
@@ -36,9 +38,10 @@ class ScraperInput(BaseModel):
hours_old: int | None = None
class Scraper:
class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...

View File

@@ -197,15 +197,16 @@ class LinkedInScraper(Scraper):
if metadata_card
else None
)
date_posted = description = job_type = job_url_direct = None
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except:
date_posted = None
job_details = {}
if full_descr:
description, job_type, job_url_direct = self._get_job_description(job_url)
job_details = self._get_job_details(job_url)
return JobPost(
title=title,
@@ -216,20 +217,18 @@ class LinkedInScraper(Scraper):
job_url=job_url,
job_url_direct=job_url_direct,
compensation=compensation,
job_type=job_type,
description=description,
emails=extract_emails_from_text(description) if description else None,
job_type=job_details.get("job_type"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
)
def _get_job_description(
self, job_page_url: str
) -> tuple[None, None, None] | tuple[
str | None, tuple[str | None, JobType | None], str | None
]:
def _get_job_details(self, job_page_url: str) -> dict:
"""
Retrieves job description by going to the job page url
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: description or None
:return: dict
"""
try:
session = create_session(is_tls=False, has_retry=True)
@@ -238,9 +237,9 @@ class LinkedInScraper(Scraper):
)
response.raise_for_status()
except:
return None, None
return {}
if response.url == "https://www.linkedin.com/signup":
return None, None
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
@@ -258,7 +257,14 @@ class LinkedInScraper(Scraper):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
return {
"description": description,
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
"data-delayed-url"
),
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""