ワンキャリアの掲載企業をスクレイピングする目的
- 個人的な就活市場の調査
環境構築
コードエディター
Visual Studio Code
OS
macOS Catalina バージョン 10.15.7
言語
Python 3.6.6
事前準備
ライブラリのインストール
- requests
- BeautifulSoup
- tqdm
この3つをインストールします。
$ pip install beautifulsoup4
$ pip install requests
$ pip install tqdm
実装コード
import requests
import math
import datetime, time, sys, calendar
from tqdm import tqdm
from bs4 import BeautifulSoup
def YmdHMS(created_at):
    """Convert a UTC timestamp string to a local-time string.

    Args:
        created_at: Timestamp formatted as
            ``'%a %b %d %H:%M:%S +0000 %Y'``, for example
            ``'Mon Sep 24 03:04:05 +0000 2018'``. The ``+0000`` offset is
            matched literally, so the value is always read as UTC.

    Returns:
        The same instant expressed in the machine's local time zone,
        formatted as ``'%Y/%m/%d %H:%M:%S'``.

    Raises:
        ValueError: If ``created_at`` does not match the expected format.
    """
    time_utc = time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
    # timegm() interprets the struct_time as UTC and returns epoch seconds.
    unix_time = calendar.timegm(time_utc)
    time_local = time.localtime(unix_time)  # Fixed on 2018/9/24
    return time.strftime("%Y/%m/%d %H:%M:%S", time_local)
# Fetch and parse the "search companies" listing page.
load_url = "https://www.onecareer.jp/companies"
# Without a timeout, requests would wait indefinitely on a stalled connection.
search_list_html = requests.get(load_url, timeout=30)
# Stop here with a clear HTTP error instead of failing later on a missing element.
search_list_html.raise_for_status()
search_list_soup = BeautifulSoup(search_list_html.content, "html.parser")

# Read the number of listed companies: the first space-separated token of the
# page-info counter text.
count_element = search_list_soup.find(class_="v2-page-info__item-count")
if count_element is None:
    raise RuntimeError(
        "Element 'v2-page-info__item-count' was not found on " + load_url
    )
event_count = int(count_element.text.split(' ')[0])
print("企業掲載件数は【" + str(event_count) + "】です。")

# Work out how many result pages have to be visited.
# NOTE(review): assumes the site lists 25 companies per page - confirm.
page_nation_count = math.ceil(event_count / 25)
print("ページネーション数は【" + str(page_nation_count) + "】です。")
# Visit every result page and keep the company entries found on each one,
# keyed as "page1", "page2", ...
page_dict = {}
for page_number in tqdm(range(1, page_nation_count + 1)):
    page_nation_url = "https://www.onecareer.jp/companies?page=" + str(page_number)
    # Without a timeout, one stalled connection would block the whole run.
    search_result_html = requests.get(page_nation_url, timeout=30)
    if not search_result_html.ok:
        # The page is still parsed and stored (usually as an empty list), but
        # the gap in the data is made visible; tqdm.write keeps the bar intact.
        tqdm.write(
            "warning: HTTP " + str(search_result_html.status_code)
            + " for " + page_nation_url
        )
    search_result_soup = BeautifulSoup(search_result_html.content, "html.parser")
    objects_list = search_result_soup.find_all(class_="v2-companies__item")
    page_dict["page" + str(page_number)] = objects_list
    # Pause between requests so the crawl does not hammer the server.
    time.sleep(1)
#URL指定(テスト用)
# page_url = "https://www.onecareer.jp/searches/companies?keyword=%E5%9B%BD%E7%AB%8B%E7%97%85%E9%99%A2&button="
# search_result_html = requests.get(page_url)
# search_result_soup = BeautifulSoup(search_result_html.content, "html.parser")
# objects_list = search_result_soup.find_all(class_="v2-companies__item")
# page_dict["test"] = objects_list
# Save the scraped companies to a tab-separated file.
# The file name carries the retrieval date.
today = datetime.date.today()
fname = "onecareer_企業を探す_" + str(today) + ".csv"

domain_url = "https://www.onecareer.jp"
# The output layout is fixed: this many (title, URL) column pairs per company.
MAX_COURSES = 4


def _clean(text):
    """Return ``text`` with line-feed and carriage-return characters removed."""
    return text.replace("\n", "").replace("\r", "")


def _class_text(company, class_name):
    """Return the cleaned text of the first element with ``class_name``, or ""."""
    element = company.find(class_=class_name)
    if element is None:
        return ""
    return _clean(element.text)


def _child_text(parent, index):
    """Return the cleaned text of ``parent.contents[index]``, or "" if absent."""
    if parent is None or len(parent.contents) <= index:
        return ""
    return _clean(parent.contents[index].text)


def _courses(company):
    """Return exactly ``MAX_COURSES`` (title, url) pairs for ``company``.

    Pairs are taken from the company's course links in document order and
    padded with ``("", "")``. Companies with more than ``MAX_COURSES`` courses
    keep the first ``MAX_COURSES`` instead of losing all of them.
    """
    steps = company.find_all(class_="v2-companies__selection-step")
    links = company.find_all(class_="v2-companies__selection-step-link")
    usable = min(len(steps), len(links), MAX_COURSES)

    pairs = []
    for link in links[:usable]:
        # NOTE(review): kept from the original - a lone course uses the link's
        # whole text, several courses use only the link's first child. Confirm
        # which of the two is actually intended.
        if len(steps) == 1:
            raw_title = link.text
        else:
            raw_title = link.contents[0] if link.contents else ""
            if not isinstance(raw_title, str):
                # First child is a tag rather than a text node.
                raw_title = raw_title.text
        href = link.get('href')
        # A link without href yields an empty URL rather than a TypeError.
        url = domain_url + href if href else ""
        pairs.append((_clean(raw_title), url))

    pairs.extend([("", "")] * (MAX_COURSES - len(pairs)))
    return pairs


header = ["企業名", "業界名", "業種名", "説明文"]
for course_number in range(1, MAX_COURSES + 1):
    header.append("募集コース" + str(course_number) + "_タイトル")
    header.append("募集コース" + str(course_number) + "_URL")

count = 0
# Explicit UTF-8 so the Japanese text is written identically on every platform.
with open(fname, 'w', encoding="utf-8") as fs:
    # Header row.
    fs.write('\t'.join(header) + '\n')

    for company_list in page_dict.values():
        for company in company_list:
            # Company name.
            company_name = _class_text(company, "v2-companies__title")

            # Industry (child 1) and business type (child 3) of the
            # business-field element; both default to "" when it is missing.
            business_field = company.find(class_="v2-companies__business-field")
            industry_main = _child_text(business_field, 1)
            industry_sub = _child_text(business_field, 3)

            # Description.
            company_description = _class_text(company, "v2-companies__description")

            # Recruitment courses: titles and absolute URLs.
            courses = _courses(company)

            print("企業名:"+ company_name)
            print("業界名:"+ industry_main)
            print("業種名:"+ industry_sub)
            print("説明文:"+ company_description)
            for course_number, (course_title, course_url) in enumerate(courses, 1):
                print("募集コース" + str(course_number) + "_タイトル:" + course_title)
                print("募集コース" + str(course_number) + "_URL:" + course_url)

            count += 1
            print(str(count) + "件目")

            row = [company_name, industry_main, industry_sub, company_description]
            for course_title, course_url in courses:
                row.append(course_title)
                row.append(course_url)
            fs.write('\t'.join(row) + '\n')
注意事項
ワンキャリアの掲載企業数は約4.5万件です。
そのため、このプログラムを動かすとワンキャリアのサーバーに多大な負荷を与えることになります。
節度を守ってご利用いただくようお願いいたします。
実装を終えて
「募集コース」の取得処理がすごく煩雑になっています。
本来は「募集コース」数の分だけカラムを動的に増やす処理にしたかったのですが、面倒臭いのでひとまずこの形で。