Specialized Unit 2 / Yamauchi Lab Seminar (2020/11/04)

Related sites and materials

9.1 stdin and stdout

egrep.py
import sys, re
   
regex = sys.argv[1]
   
for line in sys.stdin:
    # write out every line that matches the regex
    if re.search(regex, line):
        sys.stdout.write(line)
    

line_count.py
import sys
   
count = 0
for line in sys.stdin:
    count += 1
   
print(count)
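
These two scripts are intended to be used in a shell pipeline rather than run on their own. For example, assuming a file named SomeFile.txt (a hypothetical name), the following counts how many of its lines contain a digit (on Windows, use "type" instead of "cat"):

cat SomeFile.txt | python egrep.py "[0-9]" | python line_count.py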
    

most_common_words.py
import sys
from collections import Counter
  
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):  # no argument given, or it is not an integer
    print('usage: most_common_words.py num_words')
    sys.exit(1)
   
counter = Counter(word.lower() for line in sys.stdin for word in line.strip().split() if word)
   
for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))
    sys.stdout.write('\t')
    sys.stdout.write(word)
    sys.stdout.write('\n')
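
A usage example for this script (the input file name is hypothetical): the following prints the ten most common words of some_text.txt, one per line as "count<TAB>word":

cat some_text.txt | python most_common_words.py 10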
    

9.2 Reading Files

9.2.1 The Basics of Text Files

Create the file
with open('input.txt', 'w') as f:
    f.write("""# 123
123
# 456
456
789
""")
    

Count the lines that start with #
import re
   
starts_with_hash = 0
   
with open('input.txt') as f:
    for line in f:
        if re.match("^#", line):
            starts_with_hash += 1
   
print(starts_with_hash)
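
Since the pattern here is just a literal prefix, the re module is not strictly required; a minimal regex-free sketch of the same loop:

starts_with_hash = 0

with open('input.txt') as f:
    for line in f:
        # startswith("#") checks the same condition as re.match("^#", line)
        if line.startswith("#"):
            starts_with_hash += 1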
    

Create the file
with open('email_addresses.txt', 'w') as f:
    f.write("""joelgrus@gmail.com
joel@m.datasciencester.com
yama@ic.kanagawa-it.ac.jp
tosiyama@gmail.com
""")
    

Extract domain names from email addresses
from collections import Counter
  
def get_domain(email_address):
    """Split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]
   
with open('email_addresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip()) for line in f if "@" in line)
   
print(domain_counts)
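
With the four addresses written above, this should print something like Counter({'gmail.com': 2, 'm.datasciencester.com': 1, 'ic.kanagawa-it.ac.jp': 1}).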
    

9.2.2 Delimited Files

Create the file
with open('tab_delimited_stock_prices.txt', 'w') as f:
    f.write("""6/20/2014\tAAPL\t90.91
6/20/2014\tMSFT\t41.68
6/20/2014\tFB\t64.5
6/19/2014\tAAPL\t91.86
6/19/2014\tMSFT\t41.51
6/19/2014\tFB\t64.34
""")
    

Process the tab-delimited data
import csv
   
def process(date, symbol, closing_price):
    print('date={0} symbol={1} closing_price={2}'.format(date, symbol, closing_price))
   
with open('tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)
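
Using csv.reader here instead of splitting each line on '\t' by hand pays off once fields may contain quoted or escaped delimiters; the csv module handles those edge cases correctly.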
    

Create the file
with open('colon_delimited_stock_prices.txt', 'w') as f:
    f.write("""date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
""")
    

Process colon-delimited data with a header in the first line
with open('colon_delimited_stock_prices.txt') as f:
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price)
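
If a delimited file has no header row, csv.DictReader can still be used by supplying the keys explicitly through its fieldnames parameter, for example:

# sketch: the same data, assuming the file had no header row
colon_reader = csv.DictReader(f, delimiter=':', fieldnames=['date', 'symbol', 'closing_price'])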
    

Write comma-delimited data to a file
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5 }
   
with open('comma_delimited_stock_prices.txt', 'w') as f:
    csv_writer = csv.writer(f, delimiter=',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])
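
One caveat from the csv module documentation: files written with csv.writer should be opened with newline='' so that the writer's own line endings are not translated (otherwise extra blank rows can appear, notably on Windows). A safer version of the block above:

with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])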
    

9.3 Web Scraping

In the same way as the "Installing matplotlib into the virtual environment labo2020" item on the 2020/10/21 support page, install beautifulsoup4, requests, and html5lib. The command is "pip install beautifulsoup4 requests html5lib".

9.3.1 HTML and Parsing It

The HTML to be parsed
<!doctype html>
<html lang="en-US">
<head>
    <title>Getting Data</title>
    <meta charset="utf-8">
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>
</body>
</html>
    

Sample 1
from bs4 import BeautifulSoup
import requests
   
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')
   
first_paragraph = soup.find('p')
print(first_paragraph)
   
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()
   
print(first_paragraph_text)
print(first_paragraph_words)
   
first_paragraph_id2 = soup.p.get('id')
print(first_paragraph_id2)
   
all_paragraphs = soup.find_all('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
   
print(all_paragraphs)
print(paragraphs_with_ids)
   
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p') if 'important' in p.get('class', [])]
  
print(important_paragraphs)
print(important_paragraphs2)
print(important_paragraphs3)
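
find_all also accepts a class_ keyword argument (class itself is a reserved word in Python), so the class-based lookups above can equivalently be written as:

important_paragraphs4 = soup.find_all('p', class_='important')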
    

Sample 2
from bs4 import BeautifulSoup
import requests
   
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')
   
spans_inside_divs = [span for div in soup('div') for span in div('span')]
   
print(spans_inside_divs)
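
BeautifulSoup also supports CSS selectors via select(), so a descendant selector expresses the same query in a single call:

spans_inside_divs2 = soup.select('div span')  # every <span> inside a <div>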
    

9.3.2 Example: Keeping Tabs on Congress

Collect all the links from https://www.house.gov/representatives
from bs4 import BeautifulSoup
import requests
   
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, "html5lib")
    
all_urls = [a['href'] for a in soup('a') if a.has_attr('href')]
   
print(all_urls)
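
The list collected this way contains far more than the member sites. In the book it is narrowed down with a regular expression that keeps only URLs of the form http(s)://{something}.house.gov, with or without a trailing slash; a sketch:

import re

regex = r"^https?://.*\.house\.gov/?$"
good_urls = list(set(url for url in all_urls if re.match(regex, url)))

print(len(good_urls))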
    

9.4 Using APIs

In the same way as the "Installing matplotlib into the virtual environment labo2020" item on the 2020/10/21 support page, install python-dateutil. The command is "pip install python-dateutil".

9.4.2 Using an Unauthenticated API

Analyze GitHub repositories
import requests, json
from collections import Counter
from dateutil.parser import parse   
    
github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"
     
repos = json.loads(requests.get(endpoint).text)
   
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
   
#print(month_counts)
#print(weekday_counts)
   
last_5_repositories = sorted(repos, key=lambda r: r["pushed_at"], reverse=True)[:5]    
last_5_languages = [repo["language"] for repo in last_5_repositories]
   
#print(last_5_repositories)
print(last_5_languages)
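
Two side notes on this example. First, requests responses have a .json() helper, so the json.loads call above can be shortened:

# equivalent to json.loads(requests.get(endpoint).text)
repos = requests.get(endpoint).json()

Second, the unauthenticated GitHub API is heavily rate-limited, so repeated runs of this script may start receiving error responses.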