自然言語処理入門第2章演習問題②
とりあえず、中級難易度まで。
15.
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import brown
text = nltk.Text(brown.words(brown.fileids()[0]))
fdist =nltk.FreqDist(text)
for w in fdist.keys():
if fdist[w] >= 3:
print w,fdist[w]
16.
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
# トークン数 ことなり語数 語彙の多様性
from nltk.corpus import brown
print "%s\t %s\t %s\t %s" % ("category", "トークン数", "異なり語数", "多様性")
for category in brown.categories():
words = brown.words(categories = category)
words_len = len(words)
words_uniq_len = len(set(words))
print "%16s\t%6d\t%8d\t%2.2f" % (category,words_len,words_uniq_len,(words_len/words_uniq_len))
出力結果
category トークン数 異なり語数 多様性
adventure 69342 8874 7.81
belles_lettres 173096 18421 9.40
editorial 61604 9890 6.23
fiction 68488 9302 7.36
government 70117 8181 8.57
hobbies 82345 11935 6.90
humor 21695 5017 4.32
learned 181888 16859 10.79
lore 110299 14503 7.61
mystery 57169 6982 8.19
news 100554 14394 6.99
religion 39399 6373 6.18
reviews 40704 8626 4.72
romance 70022 8452 8.28
science_fiction 14470 3233 4.48
17.
stopwordsだけだと、","、"."などが入ってしまい、観測に支障がでるので、isalnum()
を用いてみる。
# -- coding: utf-8 --
from future import division
import nltk
from nltk.corpus import stopwords
from nltk.book import *
def frequent_50(text):
stop_words = stopwords.words("english")
fdist = FreqDist(text)
top_50,i=[],0
for w in fdist:
if not w in stop_words and w.isalnum():
top_50.append(w)
i += 1
if i == 49:
break
return top_50
print frequent_50(text1)
18. ストップワードをのぞき、文章中でもっとも頻繁に出現する
filterのlambda式ふむふむ。ここ
for pair in fdist.keys()[:50]:
なるほどこう言う書き方もできるのか
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import stopwords
from nltk.book import *
#ストップワードをのぞき、文章中でもっとも頻繁に出現するbigram top50
def frequent_bigrams_50(text):
stop_words = stopwords.words("english")
bigrams = nltk.bigrams(text)
bigrams = filter(lambda x:x[0] not in stop_words and x[1] not in stop_words and x[0].isalnum() and x[1].isalnum() ,bigrams)
fdist = FreqDist(bigrams)
for pair in fdist.keys()[:50]:
print pair,fdist[pair]
frequent_bigrams_50(text1)
19.
とりあえず、感情を探し出してみました。 fiction,mystery,romanceにsurprisedが多くて、まさにという感じでした。adventureは感情表現が一番豊富そうな感じでしたね。
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import brown

# Exercise 19: tabulate how often a set of emotion words occurs in
# each Brown category. (NOTE: keep the names `modals` and `cfd` —
# the quoted normalization snippet further down reuses them.)
modals = "mad relieved irritated glad angry calm surprised comfortable sad upset hurt nervous furious depressed".split(' ')
pairs = [(category, word)
         for category in brown.categories()
         for word in brown.words(categories=category)
         if word in modals]
cfd = nltk.ConditionalFreqDist(pairs)
cfd.tabulate()
ここで表の正規化までやっていらっしゃる。 とても参考になります。
def cpd_tabulate(cpd, samples): conditions = cpd.conditions() condition_size = max(len(str(c)) for c in conditions) print ' ' * condition_size, for s in samples: print "%*s" % (max(5, len(str(s))), str(s)), print for c in conditions: print "%*s" % (condition_size, str(c)), freqs = [cpd[c].prob(sample) * 100 for sample in samples] for i, f in enumerate(freqs): print "%*s" % (max(4, len(samples[i])), '%1.3f' % f), print cpd = ConditionalProbDist(cfd, ELEProbDist) cpd_tabulate(cpd, modals)
20.brown内のwordの頻度を計算する関数
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import brown
def word_freq(word,category):
words = brown.words(categories = category)
fdist = nltk.FreqDist(words)
return (fdist[word]/fdist.N())*100
print "%2.3f" % word_freq("surprised","adventure")
21. return total syllable count in text
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import brown
from nltk.corpus import cmudict
#return total syllabal count in text
def count_syllabal(text):
prondict = cmudict.dict()
cnt_syllabal = 0
for w in text:
if w in prondict:
cnt_syllabal += len(prondict[w][0])
return cnt_syllabal
print count_syllabal(brown.words(categories = "romance"))
22.
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import brown
#3単語ごとに1回likeを挟み込んだ新しいtextを生成する
#alnum以外は単語として考慮しない。
#入力はnltk.Text Object
def hadge(text):
    """Return the alphanumeric tokens of *text* as a new list with the
    word "like" inserted before every third kept token (exercise 22).

    Non-alphanumeric tokens (punctuation etc.) are dropped and do not
    count toward the every-three spacing. Input is any iterable of
    strings (e.g. an nltk.Text).
    """
    result = []
    kept = 0
    for token in text:
        if not token.isalnum():
            continue
        # Insert the hedge before tokens 3, 6, 9, ... (never before
        # the very first token).
        if kept and kept % 3 == 0:
            result.append("like")
        result.append(token)
        kept += 1
    return result
# Demo: hedge the Brown "romance" category and print the result.
print hadge(brown.words(categories = "romance"))