mongo connection setting

pull/53/head
yiwen 2023-10-02 19:32:00 +08:00
parent 3cec7bb62c
commit d63d102dbc
8 changed files with 1149 additions and 24 deletions

View File

@ -19,27 +19,27 @@ pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc

# Scrape every seeded location in turn and append the resulting rows to
# ./jobs.csv. A location whose search raises is reported and skipped.
locations = read_location_list('location_seed.json')
for location in locations:
    try:
        jobs: pd.DataFrame = scrape_jobs(
            # site_name=["indeed", "linkedin", "zip_recruiter"],
            site_name=["indeed"],
            search_term="software engineer",
            location=location,
            # be wary: the higher this is, the more likely you'll get blocked
            # (a rotating proxy should help though)
            results_wanted=30,
            country_indeed='USA',
            # offset=25  # start jobs from an offset (use if search failed and want to continue)
            proxy="http://34.120.172.140:8123",
            # proxy="http://crawler-gost-proxy.jobright-internal.com:8080",
        )
    except Exception as exc:
        print(f'Error when process: {location}')
        print(exc)
    else:
        print(f'{location}: {jobs.shape[0]} rows append.')
        # Emit the column header only when the CSV file does not exist yet,
        # so later appends add data rows only.
        needs_header = not os.path.isfile('./jobs.csv')
        jobs.to_csv('./jobs.csv', index=False, mode='a', header=needs_header)
# locations = read_location_list('location_seed.json')
# for location in locations:
# try:
# jobs: pd.DataFrame = scrape_jobs(
# # site_name=["indeed", "linkedin", "zip_recruiter"],
# site_name=["indeed"],
# search_term="software engineer",
# location=location,
# results_wanted=30,
# # be wary the higher it is, the more likely you'll get blocked (rotating proxy should work though)
# country_indeed='USA',
# # offset=25 # start jobs from an offset (use if search failed and want to continue)
# proxy="http://34.120.172.140:8123",
# # proxy="http://crawler-gost-proxy.jobright-internal.com:8080",
# )
# except Exception as e:
# print(f'Error when process: {location}')
# print(e)
# continue
# print(f'{location}: {jobs.shape[0]} rows append.')
# if os.path.isfile('./jobs.csv'):
# jobs.to_csv('./jobs.csv', index=False, mode='a', header=False)
# else:
# jobs.to_csv('./jobs.csv', index=False, mode='a', header=True)

View File

29
src/dao/mongoDAO.py Normal file
View File

@ -0,0 +1,29 @@
from pymongo import MongoClient
class MongoDBHelper:
    """Small convenience wrapper around a ``MongoClient`` bound to one database.

    The helper can also be used as a context manager, in which case the
    client is closed when the ``with`` block exits.
    """

    def __init__(self,
                 connection_host="mongodb://mongos.mongos:73hck*euuDyU!JXikCTV@172.31.57.134:27017",
                 database_name='indeed'):
        """Create the client and select the working database.

        Args:
            connection_host: MongoDB connection URI handed to ``MongoClient``.
            database_name: Name of the database all operations run against.
        """
        # SECURITY NOTE(review): the default URI embeds a username and password
        # in source control. It is kept only so existing no-argument callers
        # keep working; rotate the credential and pass the URI in from
        # configuration instead.
        self.client = MongoClient(connection_host)
        self.database = self.client[database_name]

    def __enter__(self):
        """Return the helper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the client on exit; exceptions from the block propagate."""
        self.disconnect()
        return False

    def _collection(self, collection_name):
        """Return the named collection, failing clearly after ``disconnect``."""
        # Compare with ``is None``: the database handle is not meant to be
        # truth-tested.
        if self.database is None:
            raise RuntimeError(
                "MongoDBHelper is disconnected; create a new instance before use."
            )
        return self.database[collection_name]

    def disconnect(self):
        """Close the client (if any) and drop the client/database handles."""
        if self.client is not None:
            self.client.close()
            self.client = None
            self.database = None

    def insert_one(self, collection_name, data):
        """Insert a single document and return its ``inserted_id``."""
        result = self._collection(collection_name).insert_one(data)
        return result.inserted_id

    def insert_all(self, collection_name, data_list):
        """Insert several documents and return their ``inserted_ids``.

        Args:
            collection_name: Target collection.
            data_list: Iterable of documents; ``None`` or an empty iterable is
                treated as "nothing to insert".

        Returns:
            The list of inserted ids, or ``[]`` when there was nothing to insert.
        """
        documents = [] if data_list is None else list(data_list)
        if not documents:
            # ``insert_many`` rejects an empty batch, so skip the round trip.
            return []
        result = self._collection(collection_name).insert_many(documents)
        return result.inserted_ids

    def find(self, collection_name, query):
        """Run ``query`` against the collection and return the resulting cursor."""
        return self._collection(collection_name).find(query)

View File

@ -70,6 +70,7 @@ class IndeedScraper(Scraper):
"l": scraper_input.location,
"filter": 0,
"start": scraper_input.offset + page * 10,
'fromage': 1 # only need jobs posted in recent 24h
}
if scraper_input.distance:
params["radius"] = scraper_input.distance

961
src/location_seed.json Normal file
View File

@ -0,0 +1,961 @@
[
"new york,NY",
"houston,TX",
"chicago,IL",
"brooklyn,NY",
"los angeles,CA",
"miami,FL",
"san antonio,TX",
"philadelphia,PA",
"las vegas,NV",
"bronx,NY",
"phoenix,AZ",
"dallas,TX",
"san diego,CA",
"minneapolis,MN",
"san jose,CA",
"denver,CO",
"austin,TX",
"st. louis,MO",
"indianapolis,IN",
"atlanta,GA",
"tucson,AZ",
"orlando,FL",
"portland,OR",
"seattle,WA",
"san francisco,CA",
"fort worth,TX",
"jacksonville,FL",
"milwaukee,WI",
"cincinnati,OH",
"charlotte,NC",
"columbus,OH",
"cleveland,OH",
"fort lauderdale,FL",
"sacramento,CA",
"saint paul,MN",
"el paso,TX",
"louisville,KY",
"tampa,FL",
"memphis,TN",
"pittsburgh,PA",
"detroit,MI",
"albuquerque,NM",
"oklahoma city,OK",
"washington,DC",
"fresno,CA",
"buffalo,NY",
"kansas city,MO",
"colorado springs,CO",
"bakersfield,CA",
"omaha,NE",
"birmingham,AL",
"raleigh,NC",
"dayton,OH",
"mesa,AZ",
"rochester,NY",
"long beach,CA",
"staten island,NY",
"salt lake city,UT",
"virginia beach,VA",
"nashville,TN",
"pompano beach,FL",
"hollywood,FL",
"riverside,CA",
"tulsa,OK",
"wichita,KS",
"honolulu,HI",
"knoxville,TN",
"aurora,CO",
"new orleans,LA",
"oakland,CA",
"baton rouge,LA",
"arlington,TX",
"richmond,VA",
"stockton,CA",
"anaheim,CA",
"grand rapids,MI",
"santa ana,CA",
"hialeah,FL",
"spokane,WA",
"saint petersburg,FL",
"west palm beach,FL",
"tacoma,WA",
"toledo,OH",
"spring,TX",
"corpus christi,TX",
"fort wayne,IN",
"greensboro,NC",
"littleton,CO",
"marietta,GA",
"vancouver,WA",
"lexington,KY",
"silver spring,MD",
"glendale,AZ",
"plano,TX",
"newark,NJ",
"reno,NV",
"naples,FL",
"tallahassee,FL",
"madison,WI",
"akron,OH",
"mobile,AL",
"scottsdale,AZ",
"lincoln,NE",
"henderson,NV",
"durham,NC",
"chandler,AZ",
"modesto,CA",
"pensacola,FL",
"lubbock,TX",
"katy,TX",
"winston-salem,NC",
"salem,OR",
"laredo,TX",
"jersey city,NJ",
"norfolk,VA",
"anchorage,AK",
"chula vista,CA",
"fayetteville,NC",
"lakeland,FL",
"san bernardino,CA",
"boise,ID",
"corona,CA",
"sarasota,FL",
"paradise,NV",
"north hempstead,NY",
"syracuse,NY",
"savannah,GA",
"lawrenceville,GA",
"kissimmee,FL",
"shreveport,LA",
"garland,TX",
"trenton,NJ",
"fort myers,FL",
"chesapeake,VA",
"fontana,CA",
"amarillo,TX",
"springfield,MO",
"irving,TX",
"bradenton,FL",
"lake worth,FL",
"wilmington,DE",
"boston,MA",
"montgomery,AL",
"north las vegas,NV",
"little rock,AR",
"fremont,CA",
"irvine,CA",
"des moines,IA",
"oxnard,CA",
"gilbert,AZ",
"boca raton,FL",
"ocala,FL",
"aurora,IL",
"providence,RI",
"augusta,GA",
"chattanooga,TN",
"gainesville,FL",
"brownsville,TX",
"spring valley,NV",
"moreno valley,CA",
"sunrise manor,NV",
"eugene,OR",
"huntington beach,CA",
"huntsville,AL",
"whittier,CA",
"port st. lucie,FL",
"rockford,IL",
"oceanside,CA",
"erie,PA",
"ogden,UT",
"wilmington,NC",
"woodbridge,VA",
"beaverton,OR",
"evansville,IN",
"lancaster,CA",
"salinas,CA",
"worcester,MA",
"newport news,VA",
"green bay,WI",
"yonkers,NY",
"hayward,CA",
"fort collins,CO",
"york,PA",
"jackson,MS",
"grand prairie,TX",
"torrance,CA",
"columbus,GA",
"glendale,CA",
"garden grove,CA",
"clearwater,FL",
"escondido,CA",
"palmdale,CA",
"overland park,KS",
"decatur,GA",
"new haven,CT",
"tempe,AZ",
"mission,TX",
"olympia,WA",
"ontario,CA",
"allentown,PA",
"macon,GA",
"roanoke,VA",
"alpharetta,GA",
"harrisburg,PA",
"rancho cucamonga,CA",
"el cajon,CA",
"topeka,KS",
"youngstown,OH",
"flint,MI",
"elk grove,CA",
"schenectady,NY",
"lansing,MI",
"sioux falls,SD",
"naperville,IL",
"kalamazoo,MI",
"murfreesboro,TN",
"ann arbor,MI",
"peoria,AZ",
"cape coral,FL",
"fredericksburg,VA",
"everett,WA",
"boynton beach,FL",
"homestead,FL",
"lakewood,CO",
"pomona,CA",
"north township,IN",
"pasadena,CA",
"fairfax,VA",
"pueblo,CO",
"clarksville,TN",
"lafayette,LA",
"albany,NY",
"hyattsville,MD",
"kansas city,KS",
"sugar land,TX",
"waco,TX",
"kent,WA",
"canton,OH",
"paterson,NJ",
"pasadena,TX",
"south bend,IN",
"mckinney,TX",
"bridgeport,CT",
"tyler,TX",
"springfield,MA",
"las cruces,NM",
"beaumont,TX",
"visalia,CA",
"springfield,IL",
"yuma,AZ",
"edmond,OK",
"orange,CA",
"renton,WA",
"mesquite,TX",
"sunnyvale,CA",
"miramar,FL",
"melbourne,FL",
"denton,TX",
"round rock,TX",
"columbia,MO",
"newark,DE",
"manassas,VA",
"odessa,TX",
"metairie,LA",
"ramapo,NY",
"peoria,IL",
"saginaw,MI",
"gaithersburg,MD",
"inglewood,CA",
"compton,CA",
"fullerton,CA",
"saint charles,MO",
"hamilton,OH",
"midland,TX",
"cedar rapids,IA",
"killeen,TX",
"warren,MI",
"santa maria,CA",
"santa barbara,CA",
"san mateo,CA",
"mcallen,TX",
"cary,NC",
"bellevue,WA",
"humble,TX",
"rockville,MD",
"victorville,CA",
"conroe,TX",
"olathe,KS",
"west valley city,UT",
"sterling heights,MI",
"cumming,GA",
"puyallup,WA",
"muskegon,MI",
"joliet,IL",
"billings,MT",
"racine,WI",
"bloomington,IN",
"simi valley,CA",
"yakima,WA",
"lake charles,LA",
"abilene,TX",
"hartford,CT",
"concord,CA",
"independence,MO",
"bethlehem,PA",
"roseville,CA",
"cypress,TX",
"stamford,CT",
"toms river,NJ",
"arvada,CO",
"boulder,CO",
"gainesville,GA",
"asheville,NC",
"frederick,MD",
"frisco,TX",
"surprise,AZ",
"carrollton,TX",
"myrtle beach,SC",
"vallejo,CA",
"berkeley,CA",
"panama city,FL",
"rochester,MN",
"hemet,CA",
"bellingham,WA",
"broken arrow,OK",
"longmont,CO",
"elgin,IL",
"duluth,MN",
"appleton,WI",
"falls church,VA",
"high point,NC",
"murrieta,CA",
"santa clara,CA",
"la puente,CA",
"spring hill,FL",
"new port richey,FL",
"temecula,CA",
"norman,OK",
"west jordan,UT",
"provo,UT",
"kenosha,WI",
"tuscaloosa,AL",
"ventura,CA",
"pearland,TX",
"charlottesville,VA",
"stone mountain,GA",
"vista,CA",
"downey,CA",
"redding,CA",
"costa mesa,CA",
"waterbury,CT",
"centennial,CO",
"sparks,NV",
"st. augustine,FL",
"clovis,CA",
"miami beach,FL",
"lewisville,TX",
"florissant,MO",
"greenville,NC",
"west covina,CA",
"fargo,ND",
"daly city,CA",
"rock hill,SC",
"delray beach,FL",
"spartanburg,SC",
"bothell,WA",
"burbank,CA",
"sandy springs,GA",
"lowell,MA",
"tracy,CA",
"perris,CA",
"chico,CA",
"jurupa valley,CA",
"fairfield,CA",
"vero beach,FL",
"sandy,UT",
"norwalk,CA",
"bend,OR",
"cambridge,MA",
"douglasville,GA",
"carlsbad,CA",
"davie,FL",
"college station,TX",
"palm bay,FL",
"san leandro,CA",
"concord,NC",
"nampa,ID",
"redwood city,CA",
"springfield,VA",
"bowling green,KY",
"jackson,MI",
"duluth,GA",
"san angelo,TX",
"largo,FL",
"auburn,WA",
"baytown,TX",
"grand junction,CO",
"antioch,CA",
"gastonia,NC",
"davenport,IA",
"longview,TX",
"laurel,MD",
"wichita falls,TX",
"springfield,OH",
"idaho falls,ID",
"albany,GA",
"rialto,CA",
"greeley,CO",
"bell gardens,CA",
"scranton,PA",
"edison,NJ",
"englewood,CO",
"hesperia,CA",
"vacaville,CA",
"daytona beach,FL",
"richardson,TX",
"ypsilanti,MI",
"lynchburg,VA",
"madera,CA",
"waukegan,IL",
"arden-arcade,CA",
"elizabeth,NJ",
"dearborn,MI",
"lafayette,IN",
"santa cruz,CA",
"spokane valley,WA",
"merced,CA",
"richmond,TX",
"elkhart,IN",
"rochester,MI",
"livonia,MI",
"lee's summit,MO",
"charleston,WV",
"winter haven,FL",
"belleville,IL",
"roswell,GA",
"terre haute,IN",
"bowie,MD",
"hagerstown,MD",
"columbia,MD",
"keller,TX",
"waukesha,WI",
"new bedford,MA",
"mission viejo,CA",
"plainfield,IL",
"south gate,CA",
"sunrise,FL",
"lawrence,KS",
"napa,CA",
"muncie,IN",
"brockton,MA",
"valdosta,GA",
"anderson,SC",
"lawton,OK",
"allen,TX",
"hawthorne,CA",
"mansfield,OH",
"hillsboro,OR",
"lynnwood,WA",
"lakewood township,NJ",
"kennesaw,GA",
"ballwin,MO",
"palo alto,CA",
"broomfield,CO",
"quincy,MA",
"battle creek,MI",
"annapolis,MD",
"johnson city,TN",
"santa monica,CA",
"jacksonville,NC",
"acworth,GA",
"missouri city,TX",
"federal way,WA",
"chapel hill,NC",
"greenburgh,NY",
"missoula,MT",
"el monte,CA",
"mcdonough,GA",
"franklin,TN",
"rapid city,SD",
"lynn,MA",
"lithonia,GA",
"canton,MI",
"kennewick,WA",
"fort smith,AR",
"des plaines,IL",
"westminster,CA",
"holland,MI",
"jupiter,FL",
"o'fallon,MO",
"fayetteville,AR",
"bryan,TX",
"san marcos,CA",
"woodstock,GA",
"fort pierce,FL",
"rio rancho,NM",
"cleveland,TN",
"champaign,IL",
"deltona,FL",
"orem,UT",
"midlothian,VA",
"fall river,MA",
"apopka,FL",
"farmington,MI",
"decatur,IL",
"morgantown,WV",
"indio,CA",
"harlingen,TX",
"kingsport,TN",
"herndon,VA",
"saint joseph,MO",
"lima,OH",
"poughkeepsie,NY",
"monroe,LA",
"medford,OR",
"nashua,NH",
"jackson,TN",
"clarkstown,NY",
"cheyenne,WY",
"covington,GA",
"norristown,PA",
"westerville,OH",
"sioux city,IA",
"bloomington,MN",
"upper marlboro,MD",
"springdale,AR",
"riverview,FL",
"norwalk,CT",
"williamsburg,VA",
"gary,IN",
"buford,GA",
"slidell,LA",
"hoover,AL",
"lehigh acres,FL",
"parker,CO",
"bremerton,WA",
"newnan,GA",
"pittsburg,CA",
"cicero,IL",
"suffolk,VA",
"flagstaff,AZ",
"johns creek,GA",
"bloomington,IL",
"westland,MI",
"meridian,ID",
"gardena,CA",
"citrus heights,CA",
"sumter,SC",
"livermore,CA",
"chino,CA",
"easton,PA",
"conyers,GA",
"clifton,NJ",
"gulfport,MS",
"houma,LA",
"milton,FL",
"carson,CA",
"alhambra,CA",
"glen burnie,MD",
"germantown,MD",
"loveland,CO",
"hattiesburg,MS",
"walnut creek,CA",
"watsonville,CA",
"ashburn,VA",
"binghamton,NY",
"clermont,FL",
"maryville,TN",
"owensboro,KY",
"jonesboro,GA",
"warner robins,GA",
"eau claire,WI",
"farmington hills,MI",
"troy,MI",
"danbury,CT",
"tustin,CA",
"oshkosh,WI",
"brooklyn park,MN",
"dalton,GA",
"dothan,AL",
"ellicott city,MD",
"buena park,CA",
"bethesda,MD",
"arlington heights,IL",
"lakewood,CA",
"sterling,VA",
"brandon,FL",
"macomb township,MI",
"palm coast,FL",
"dallas,GA",
"anderson,IN",
"middletown,OH",
"parma,OH",
"evanston,IL",
"hickory,NC",
"greenwood,IN",
"covina,CA",
"new braunfels,TX",
"utica,MI",
"valparaiso,IN",
"thousand oaks,CA",
"sanford,FL",
"bismarck,ND",
"waldorf,MD",
"pocatello,ID",
"bloomington township,IL",
"gresham,OR",
"palatine,IL",
"camarillo,CA",
"iowa city,IA",
"apple valley,CA",
"warwick,RI",
"turlock,CA",
"jonesboro,AR",
"conway,AR",
"redmond,WA",
"stafford,VA",
"carmel,IN",
"upland,CA",
"porterville,CA",
"lexington,NC",
"monroe,NC",
"yuba city,CA",
"bellflower,CA",
"baldwin park,CA",
"lawrence,MA",
"norcross,GA",
"schaumburg,IL",
"warren,OH",
"southfield,MI",
"smyrna,GA",
"greer,SC",
"redondo beach,CA",
"huntington park,CA",
"huntington,WV",
"st. george,UT",
"avondale,AZ",
"chino hills,CA",
"kirkland,WA",
"wilkes-barre,PA",
"mountain view,CA",
"snellville,GA",
"plant city,FL",
"st. cloud,MN",
"palm harbor,FL",
"davis,CA",
"simpsonville,SC",
"manteca,CA",
"rochester hills,MI",
"brick,NJ",
"springfield,OR",
"somerville,MA",
"alameda,CA",
"redlands,CA",
"leesburg,VA",
"fishers,IN",
"hammond,IN",
"brighton,CO",
"san ramon,CA",
"new rochelle,NY",
"bolingbrook,IL",
"kokomo,IN",
"new britain,CT",
"marysville,WA",
"temple,TX",
"lodi,CA",
"mount pleasant,SC",
"janesville,WI",
"waipahu,HI",
"joplin,MO",
"opa-locka,FL",
"folsom,CA",
"great falls,MT",
"goldsboro,NC",
"oviedo,FL",
"rocky mount,NC",
"lauderhill,FL",
"rogers,AR",
"utica,NY",
"council bluffs,IA",
"pleasanton,CA",
"san tan valley,AZ",
"north charleston,SC",
"orange park,FL",
"league city,TX",
"waterford township,MI",
"plainfield,NJ",
"winter park,FL",
"bessemer,AL",
"pawtucket,RI",
"denham springs,LA",
"johnstown,PA",
"dubuque,IA",
"cherry hill,NJ",
"troy,NY",
"weston,FL",
"cranston,RI",
"pharr,TX",
"petaluma,CA",
"san rafael,CA",
"winchester,VA",
"traverse city,MI",
"white plains,NY",
"la mesa,CA",
"passaic,NJ",
"bossier city,LA",
"mooresville,NC",
"woodbury,MN",
"parkville,MD",
"lynwood,CA",
"tulare,CA",
"union city,CA",
"aiken,SC",
"shawnee,KS",
"north little rock,AR",
"dover,DE",
"statesville,NC",
"casper,WY",
"dublin,OH",
"casas adobes,AZ",
"lake elsinore,CA",
"pflugerville,TX",
"brandon,MS",
"port charlotte,FL",
"centreville,VA",
"jefferson city,MO",
"palm beach gardens,FL",
"framingham,MA",
"kenner,LA",
"pasco,WA",
"sanford,NC",
"tomball,TX",
"lorain,OH",
"grants pass,OR",
"bay city,MI",
"layton,UT",
"matthews,NC",
"la habra,CA",
"fort mill,SC",
"harrisonburg,VA",
"columbus,IN",
"castle rock,CO",
"spring valley,CA",
"mount vernon,NY",
"huntington station,NY",
"meridian,MS",
"gadsden,AL",
"mechanicsburg,PA",
"goodyear,AZ",
"stuart,FL",
"west lafayette,IN",
"milpitas,CA",
"hendersonville,NC",
"ithaca,NY",
"eagan,MN",
"yorba linda,CA",
"union city,NJ",
"state college,PA",
"port orange,FL",
"west bloomfield township,MI",
"stockbridge,GA",
"arcadia,CA",
"cocoa,FL",
"tamarac,FL",
"massillon,OH",
"glen allen,VA",
"hanford,CA",
"portland,ME",
"corvallis,OR",
"weatherford,TX",
"cedar park,TX",
"new bern,NC",
"tinley park,IL",
"cookeville,TN",
"decatur,AL",
"midland,MI",
"flower mound,TX",
"orland park,IL",
"suwanee,GA",
"blaine,MN",
"noblesville,IN",
"east orange,NJ",
"elyria,OH",
"punta gorda,FL",
"brooksville,FL",
"logan,UT",
"west hartford,CT",
"venice,FL",
"levittown,PA",
"fairbanks,AK",
"south san francisco,CA",
"east lansing,MI",
"buckeye,AZ",
"weslaco,TX",
"san marcos,TX",
"florence-graham,CA",
"bel air,MD",
"san clemente,CA",
"madison,AL",
"mechanicsville,VA",
"la crosse,WI",
"huntsville,TX",
"sheboygan,WI",
"san luis obispo,CA",
"berwyn,IL",
"altoona,PA",
"laguna niguel,CA",
"vienna,VA",
"florence,AL",
"eastvale,CA",
"taylor,MI",
"lufkin,TX",
"bayonne,NJ",
"newport beach,CA",
"pico rivera,CA",
"ames,IA",
"pottstown,PA",
"ewa beach,HI",
"coon rapids,MN",
"fayetteville,GA",
"casa grande,AZ",
"montebello,CA",
"bay shore,NY",
"clearfield,UT",
"san gabriel,CA",
"north miami,FL",
"middletown,NY",
"lakewood,WA",
"rosemead,CA",
"medina,OH",
"north port,FL",
"griffin,GA",
"valrico,FL",
"port orchard,WA",
"ormond beach,FL",
"carson city,NV",
"north richland hills,TX",
"deland,FL",
"moore,OK",
"saint cloud,FL",
"caldwell,ID",
"doral,FL",
"conway,SC",
"hamden,CT",
"valley stream,NY",
"biloxi,MS",
"meriden,CT",
"manhattan,KS",
"west new york,NJ",
"haverhill,MA",
"eden prairie,MN",
"north bergen,NJ",
"fond du lac,WI",
"brentwood,NY",
"rancho cordova,CA",
"cupertino,CA",
"vineland,NJ",
"texarkana,TX",
"snohomish,WA",
"mankato,MN",
"waltham,MA",
"crown point,IN",
"pasadena,MD",
"new iberia,LA",
"euless,TX",
"bristol,CT",
"bellevue,NE",
"danville,VA",
"powder springs,GA",
"burnsville,MN",
"skokie,IL",
"monterey park,CA",
"mentor,OH",
"millcreek,UT",
"cartersville,GA",
"easley,SC",
"petersburg,VA",
"round lake,IL",
"south whittier,CA",
"taylorsville,UT",
"west allis,WI",
"pontiac,MI",
"wilson,NC",
"woodland,CA",
"carrollton,GA",
"reston,VA",
"kendale lakes,FL",
"burleson,TX",
"phenix city,AL",
"azusa,CA",
"saint clair shores,MI",
"crystal lake,IL",
"lilburn,GA",
"newark,OH",
"lancaster,OH",
"malden,MA",
"wheaton,IL",
"greensburg,PA",
"loganville,GA",
"carmichael,CA",
"hempstead,NY",
"hendersonville,TN",
"fontainebleau,FL",
"new castle,DE",
"lake forest,CA",
"bartlett,TN",
"apex,NC",
"spring valley,NY",
"the hammocks,FL",
"albany,OR",
"bozeman,MT",
"martinsburg,WV",
"wesley chapel,FL",
"novato,CA",
"grove city,OH",
"east saint louis,IL",
"tamiami,FL",
"chicago heights,IL",
"medford,MA",
"chillicothe,OH",
"farmington,NM",
"dearborn heights,MI",
"riverdale,GA",
"howell,MI",
"lake city,FL",
"paducah,KY",
"golden,CO",
"yukon,OK",
"lutz,FL",
"lompoc,CA",
"gilroy,CA",
"midwest city,OK",
"rocklin,CA",
"royal oak,MI",
"westminster,MD",
"roswell,NM",
"oak lawn,IL",
"mchenry,IL",
"goshen,IN",
"new baltimore,MI",
"castro valley,CA",
"downers grove,IL",
"colton,CA",
"painesville,OH",
"altamonte springs,FL",
"marrero,LA",
"st. charles,IL",
"plymouth,MA",
"freehold township,NJ",
"butler,PA",
"shoreline,WA",
"rockwall,TX",
"palm desert,CA",
"blue springs,MO",
"fountain valley,CA",
"annandale,VA",
"piscataway township,NJ",
"mishawaka,IN",
"monroe,MI"
]

39
src/scrape.py Normal file
View File

@ -0,0 +1,39 @@
from dao.mongoDAO import MongoDBHelper
from tools.tools import get_location_list, get_title_list
import os
from jobspy import scrape_jobs
import pandas as pd
from jobspy.jobs import JobPost
# Helper for writing scraped data to MongoDB, created with its default
# connection settings.
mongo_helper = MongoDBHelper()
# Search seeds: the loop below runs one search per (location, title) pair.
locations = get_location_list()
titles = get_title_list()
def write_jobs_to_mongo(job_list: "list[JobPost]", mongo: "MongoDBHelper"):
    """Placeholder persistence hook: currently it only prints the jobs.

    Args:
        job_list: Job postings that should eventually be stored.
        mongo: Helper meant to receive the postings; unused until the insert
            below is enabled.
    """
    print(job_list)
    # TODO: enable persistence. MongoDBHelper.insert_all takes the target
    # collection name as its first argument, so the call needs one:
    # mongo.insert_all(<collection_name>, job_list)
# Run one Indeed search for every (location, title) combination. A search
# that raises is reported and skipped so the remaining pairs still run.
for location in locations:
    for title in titles:
        search_options = dict(
            site_name=["indeed"],
            search_term=title,
            location=location,
            results_wanted=30,
            country_indeed='USA',
            # offset=25  # start jobs from an offset (use if search failed and want to continue)
            proxy="http://34.120.172.140:8123",
            # proxy="http://crawler-gost-proxy.jobright-internal.com:8080"
        )
        try:
            jobs: pd.DataFrame = scrape_jobs(**search_options)
        except Exception as exc:
            print(f'Error when process: [{location}][{title}]')
            print(exc)
        else:
            print(f'[{location}][{title}]: {jobs.shape[0]} rows append.')

79
src/title_seed.json Normal file
View File

@ -0,0 +1,79 @@
[
"Administrative Assistant",
"Executive Assistant",
"Office Manager",
"Project Manager",
"Program Manager",
"Technical Project Manager",
"IT Project Manager",
"Scrum Master",
"Business Analyst",
"Business Development Representative",
"Business Development Manager",
"Operations Manager",
"Operations Associate",
"Data Analyst",
"Data Scientist",
"Security Engineer",
"Backend Software Engineer",
"Frontend Software Engineer",
"Blockchain Engineer",
"Software Engineer",
"Java Developer",
"Full Stack Engineer",
"Python Developer",
".Net Developer",
"Software Engineer in Test",
"Android Engineer",
"React Developer",
"Embedded Software Engineer",
"Support Engineer",
"Web Developer",
"iOS Developer",
"Data Engineer",
"Data Architect",
"Machine Learning Engineer",
"NLP Engineer",
"Artificial Intelligence Engineer",
"Network Engineer",
"Network Administrator",
"Systems Engineer",
"Systems Administrator",
"DevOps Engineer",
"Site Reliability Engineer",
"Electrical Engineer",
"Hardware Engineer",
"Hardware Design Engineer",
"Mechanical Engineer",
"Manufacturing Engineer",
"Civil Engineer",
"Structural Engineer",
"Controls Engineer",
"Quality Engineer",
"UX Researcher",
"UX Designer",
"Product Manager",
"Product Owner",
"Product Designer",
"HR Generalist",
"Technical Recruiter",
"Recruiting Coordinator",
"General Sales",
"Account Executive",
"Sales Account Manager",
"Brand Marketing Manager",
"Product Marketing Manager",
"Marketing Specialist",
"Accounting Manager",
"Accountant",
"Controller",
"Tax Manager",
"Audit Manager",
"Financial Analyst",
"Finance Manager",
"Financial Advisor",
"IT Support Specialist",
"Technical Support Engineer",
"Technical Support Specialist",
"Customer Service Representative"
]

16
src/tools/tools.py Normal file
View File

@ -0,0 +1,16 @@
import json
def read_json_list(json_list_file):
    """Load a JSON file and return its top-level items as a list.

    Args:
        json_list_file: Path of a JSON file whose top-level value is iterable
            (normally an array of strings).

    Returns:
        A new list holding the items of the parsed document.
    """
    # JSON text is UTF-8; naming the encoding avoids depending on the
    # platform's locale default when the file contains non-ASCII characters.
    with open(json_list_file, encoding="utf-8") as f:
        item_list = list(json.load(f))
    print(f'finish read json list file: {json_list_file}, res len: {len(item_list)}.')
    return item_list
def get_location_list(location_file='location_seed.json'):
    """Return the search locations stored in ``location_file``.

    The path is handed to ``read_json_list`` unchanged.
    """
    locations = read_json_list(location_file)
    return locations
def get_title_list(title_file='title_seed.json'):
    """Return the job titles stored in ``title_file``.

    The path is handed to ``read_json_list`` unchanged.
    """
    titles = read_json_list(title_file)
    return titles