enh: linkedin company logo (#141)

pull/146/head
Cullen Watson 2024-04-30 12:03:10 -05:00 committed by GitHub
parent 8dd08ed9fd
commit bf73c061bd
6 changed files with 1072 additions and 969 deletions


@@ -27,4 +27,4 @@ print("outputted to jobs.csv")
 # jobs.to_xlsx('jobs.xlsx', index=False)
 # 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
 # display(jobs)


@@ -32,17 +32,18 @@ while len(all_jobs) < results_wanted:
         search_term="software engineer",
         # New York, NY
         # Dallas, TX
         # Los Angeles, CA
         location="Los Angeles, CA",
-        results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+        results_wanted=min(
+            results_in_each_iteration, results_wanted - len(all_jobs)
+        ),
         country_indeed="USA",
         offset=offset,
         # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
     )
     # Add the scraped jobs to the list
-    all_jobs.extend(jobs.to_dict('records'))
+    all_jobs.extend(jobs.to_dict("records"))
     # Increment the offset for the next page of results
     offset += results_in_each_iteration
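For readers skimming the hunk, the surrounding example is a simple offset-pagination loop. A minimal, self-contained sketch of that pattern, assuming the usual "from jobspy import scrape_jobs" import; the totals and the Indeed-only site list are illustrative and not part of this commit:

from jobspy import scrape_jobs

results_wanted = 100            # illustrative total to collect
results_in_each_iteration = 25  # illustrative page size per call
offset = 0
all_jobs = []

while len(all_jobs) < results_wanted:
    jobs = scrape_jobs(
        site_name=["indeed"],
        search_term="software engineer",
        location="Los Angeles, CA",
        results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
        country_indeed="USA",
        offset=offset,
    )
    if jobs.empty:  # no more results for this query
        break
    # Add the scraped jobs to the list
    all_jobs.extend(jobs.to_dict("records"))
    # Increment the offset for the next page of results
    offset += results_in_each_iteration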

poetry.lock (generated)

File diff suppressed because it is too large.


@@ -25,8 +25,8 @@ regex = "^2024.4.28"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.1"
 jupyter = "^1.0.0"
-black = "^24.2.0"
-pre-commit = "^3.6.2"
+black = "*"
+pre-commit = "*"

 [build-system]
 requires = ["poetry-core"]


@@ -1,5 +1,7 @@
 from __future__ import annotations

+from abc import ABC, abstractmethod
+
 from ..jobs import (
     Enum,
     BaseModel,
@@ -36,9 +38,10 @@ class ScraperInput(BaseModel):
     hours_old: int | None = None


-class Scraper:
+class Scraper(ABC):
     def __init__(self, site: Site, proxy: list[str] | None = None):
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

+    @abstractmethod
     def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
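Because Scraper is now an ABC with scrape marked @abstractmethod, a site scraper that does not override scrape can no longer be instantiated. A minimal sketch of a conforming subclass, assuming the package's existing Site, ScraperInput, and JobResponse types; the subclass itself is hypothetical:

# Hypothetical subclass shown only to illustrate the new abstract interface;
# Scraper, Site, ScraperInput, and JobResponse are the package's own types.
class ExampleScraper(Scraper):
    def __init__(self, proxy: list[str] | None = None):
        super().__init__(Site.LINKEDIN, proxy=proxy)

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        # A real implementation would fetch and parse postings here.
        return JobResponse(jobs=[])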


@@ -197,15 +197,16 @@ class LinkedInScraper(Scraper):
             if metadata_card
             else None
         )
-        date_posted = description = job_type = job_url_direct = None
+        date_posted = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:
                 date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
             except:
                 date_posted = None
+        job_details = {}
         if full_descr:
-            description, job_type, job_url_direct = self._get_job_description(job_url)
+            job_details = self._get_job_details(job_url)

         return JobPost(
             title=title,
@@ -216,20 +217,18 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
-            job_url_direct=job_url_direct,
             compensation=compensation,
-            job_type=job_type,
-            description=description,
-            emails=extract_emails_from_text(description) if description else None,
+            job_type=job_details.get("job_type"),
+            description=job_details.get("description"),
+            job_url_direct=job_details.get("job_url_direct"),
+            emails=extract_emails_from_text(job_details.get("description")),
+            logo_photo_url=job_details.get("logo_photo_url"),
         )

-    def _get_job_description(
-        self, job_page_url: str
-    ) -> tuple[None, None, None] | tuple[
-        str | None, tuple[str | None, JobType | None], str | None
-    ]:
+    def _get_job_details(self, job_page_url: str) -> dict:
         """
-        Retrieves job description by going to the job page url
+        Retrieves job description and other job details by going to the job page url
         :param job_page_url:
-        :return: description or None
+        :return: dict
         """
         try:
             session = create_session(is_tls=False, has_retry=True)
@@ -238,9 +237,9 @@ class LinkedInScraper(Scraper):
             )
             response.raise_for_status()
         except:
-            return None, None
+            return {}
         if response.url == "https://www.linkedin.com/signup":
-            return None, None
+            return {}

         soup = BeautifulSoup(response.text, "html.parser")
         div_content = soup.find(
@@ -258,7 +257,14 @@ class LinkedInScraper(Scraper):
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                 description = markdown_converter(description)
-        return description, self._parse_job_type(soup), self._parse_job_url_direct(soup)
+        return {
+            "description": description,
+            "job_type": self._parse_job_type(soup),
+            "job_url_direct": self._parse_job_url_direct(soup),
+            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
+                "data-delayed-url"
+            ),
+        }

     def _get_location(self, metadata_card: Optional[Tag]) -> Location:
         """