自然言語処理入門第2章演習問題①
1.単語のリストを含んだ変数を作成して、操作してみる
# 1. Create a variable holding a list of words and experiment with it.
wlist = ["summer", "autumn", "winter", "spring"]
print(wlist)
# `wlist + "fortal"` raises TypeError: a list only concatenates with another
# list, so the extra word must be wrapped in a list literal.
# wlist + "fortal"
print(wlist + ["fortal"])
#> ['summer', 'autumn', 'winter', 'spring', 'fortal']
print(wlist * 2)
#> ['summer', 'autumn', 'winter', 'spring', 'summer', 'autumn', 'winter', 'spring']
print(wlist[2])
#> winter
print(wlist[1:3])
#> ['autumn', 'winter']
print(sorted(wlist))
#> ['autumn', 'spring', 'summer', 'winter']
2.単語トークン、異なり語を調べる
# -*- coding: utf-8 -*-
# 2. Count word tokens and distinct word types in Austen's "Persuasion".
import nltk
from nltk.corpus import gutenberg

austen = gutenberg.words('austen-persuasion.txt')
print(len(austen))       # total tokens (Python 3 print-function form)
print(len(set(austen)))  # distinct types (vocabulary size)
3.略
4.一般教書演説のmen,women,peopleの単語の出現頻度
説明読んだだけじゃConditionalFreqDistよく分からなかったけど、これといて分かった。
word_listでfor多用して多次元配列を作って、cfdにぶち込んで、いろいろとmethodを付けてあげるようなイメージですね。
# -*- coding: utf-8 -*-
# 4. Frequency of "men"/"women"/"people" in each State of the Union address,
# keyed by the year prefix of the file id.
import nltk
from nltk.corpus import state_union

pairs = []
for fileid in state_union.fileids():
    year = fileid[:4]
    for word in state_union.words(fileids=fileid):
        for target in ["men","women","people"]:
            if word.lower() == target:
                pairs.append((year, target))

cfd = nltk.ConditionalFreqDist(pairs)
cfd.tabulate()
5.holonymとmeronym
# -*- coding: utf-8 -*-
# 5. Explore meronyms (part-of relations) and holonyms (whole-of relations)
# for a handful of noun synsets.
import nltk
from nltk.corpus import wordnet as wn

words = ["people.n.01","computer.n.01","source.n.01","lion.n.01","card.n.01",
         "bed.n.01"]
for word in words:
    # Use a separate name instead of rebinding the loop variable `word`.
    synset = wn.synset(word)
    print("word:%s" % synset)
    print(synset)
    print("meronyms")
    print(synset.member_meronyms())
    print(synset.part_meronyms())
    print(synset.substance_meronyms())
    print("holonyms")
    print(synset.member_holonyms())
    print(synset.part_holonyms())
    print(synset.substance_holonyms())
6.スルーで
1つの単語が複数の意味を持っている場合に、どうやってtranslateするかが問題かな。 ここで、wordnetの出番なのかな、と。
7.スルーで
8.男性と女性の頭文字のアルファベット
# -*- coding: utf-8 -*-
# 8. First-letter distribution of male vs. female first names.
import nltk
from nltk.corpus import names

# fileid[:-4] strips the ".txt" extension, leaving "male"/"female" as the
# condition; name[0] is the initial letter.
cfd = nltk.ConditionalFreqDist(
    (fileid[:-4], name[0])
    for fileid in names.fileids()
    for name in names.words(fileids=fileid)
)
# This call was fused onto the end of the previous line, a SyntaxError.
cfd.tabulate()
9.2テキスト間における語彙、語彙の豊富さ、ジャンルの違い
参考にしつつPythonのモジュールとかも勉強。 sysへえー、StringIOへえーってかんじです。
# -*- coding: utf-8 -*-
# 9. Find words common to two texts whose Text.similar() neighbourhoods share
# no word at all -- a rough signal the word is used with different senses.
import sys
from io import StringIO  # Python 3 location (the py2 StringIO module is gone)
import nltk
from nltk.text import Text

# nltk.book prints a banner on import; silence it by swapping stdout.
sys.stdout = StringIO()
from nltk.book import *
sys.stdout = sys.__stdout__

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def different_mean_words(text1, text2):
    """Return common non-stopword words of *text1*/*text2* whose
    similar-word sets (per Text.similar) have an empty intersection."""
    common_words = (set(text1) & set(text2)) - stop_words
    ans = []
    for w in common_words:
        # Text.similar() prints instead of returning, so capture stdout.
        sys.stdout = output1 = StringIO()
        text1.similar(w)
        sys.stdout = output2 = StringIO()
        text2.similar(w)
        sys.stdout = sys.__stdout__
        similar_words1 = output1.getvalue().replace('Building word-context index...', '')
        similar_words2 = output2.getvalue().replace('Building word-context index...', '')
        similar1 = set(similar_words1.split(' '))
        similar2 = set(similar_words2.split(' '))
        if len(similar1 & similar2) == 0:
            print(w)
            ans.append(w)
    return ans


result = different_mean_words(text1, text2)
print(result)  # was `print results` -- a NameError typo
10
# -*- coding: utf-8 -*-
# 10. For each text: how many distinct alphanumeric word types, taken most
# frequent first, are needed to cover a third of all tokens?
import nltk
from nltk.book import *

for text in [text1,text2,text3,text4,text5,text6,text7,text8,text9]:
    fdist = nltk.FreqDist(text)
    ans_len, ans_words = 0, []
    # most_common() guarantees descending-frequency order; plain
    # FreqDist.keys() only happened to be sorted that way in old NLTK.
    for word, count in fdist.most_common():
        if not word.isalnum():
            continue
        if ans_len > len(text) * 0.33:
            break
        ans_len += count
        ans_words.append(word)
    print(len(ans_words), text)
結果
48 <Text: Moby Dick by Herman Melville 1851>
33 <Text: Sense and Sensibility by Jane Austen 1811>
21 <Text: The Book of Genesis>
23 <Text: Inaugural Address Corpus>
53 <Text: Chat Corpus>
92 <Text: Monty Python and the Holy Grail>
94 <Text: Wall Street Journal>
74 <Text: Personals Corpus>
36 <Text: The Man Who Was Thursday by G . K . Chesterton 1908>
ここのかたと数値が異なっているのはregexを使っていないからなのだと思うのですが、それでもかなり数値に開きが出てしまっていることがちょっと心配。
間違ってないよね?
11
法助動詞の分布表
# -*- coding: utf-8 -*-
# 11. Tabulate modal-verb frequencies across the Brown corpus categories.
import nltk
from nltk.corpus import brown

modals = "will may can must shall should could would".split(' ')

pairs = []
for category in brown.categories():
    for w in brown.words(categories=category):
        if w in modals:
            pairs.append((category, w))

cfd = nltk.ConditionalFreqDist(pairs)
cfd.tabulate()
結果
can could may must shall should will would
adventure 46 151 5 27 7 15 50 191
belles_lettres 246 213 207 170 34 102 236 392
editorial 121 56 74 53 19 88 233 180
fiction 37 166 8 55 3 35 52 287
government 117 38 153 102 98 112 244 120
hobbies 268 58 131 83 5 73 264 78
humor 16 30 8 9 2 7 13 56
learned 365 159 324 202 40 171 340 319
lore 170 141 165 96 12 76 175 186
mystery 42 141 13 30 1 29 20 186
news 93 86 66 50 5 59 389 244
religion 82 59 78 54 21 45 71 68
reviews 45 40 45 19 1 18 58 47
romance 74 193 11 45 3 32 43 244
science_fiction 16 49 4 8 3 3 16 79
12
# -*- coding: utf-8 -*-
# 12. Fraction of cmudict entries that are extra pronunciations of an earlier
# word, plus -- for each such word -- one phone that differs from the
# previous pronunciation.
from __future__ import division
import nltk
from nltk.corpus import cmudict

entries = nltk.corpus.cmudict.entries()  # sorted, so duplicates are adjacent
cnt = 0            # number of distinct words
prev_word = None   # None sentinel (the original used [], which also works
prev_pron = None   # but reads as if a list were expected)
diff_phones = {}   # word -> a phone present here but not in the prior entry
for word, pron in entries:
    if word != prev_word:
        cnt += 1
    else:
        # `pron` is a list of ARPAbet phones, not syllables.
        for phone in pron:
            if phone not in prev_pron:
                diff_phones[word] = phone  # keeps the last differing phone
    prev_word = word
    prev_pron = pron

print((len(entries) - cnt) / len(entries))  # share of extra pronunciations
print(diff_phones)                          # where the pronunciations differ
13.下位語を持たない名詞の同義語集合
# -*- coding: utf-8 -*-
# 13. Fraction of noun synsets with no hyponyms (leaves of the noun hierarchy).
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn

cnt, total = 0, 0
for synset in wn.all_synsets('n'):
    total += 1
    if not synset.hyponyms():
        cnt += 1
print(cnt / total)  # Python 3 print-function form
14.
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
#sの定義、下位語、上位語の全ての定義を連結した文字列を返す
def supergloss(s):
    """Return one string containing the definition of synset *s*, followed by
    the definitions of all of its hyponyms, then all of its hypernyms."""
    def entry(synset):
        # NOTE(review): attribute access matches NLTK 2.x; NLTK 3 turned
        # `name`/`definition` into methods -- confirm the library version.
        return "[%s] %s\n" % (synset.name, synset.definition)

    lines = [entry(s)]
    lines.extend(entry(related) for related in s.hyponyms())
    lines.extend(entry(related) for related in s.hypernyms())
    return "".join(lines)
print(supergloss(wn.synset("car.n.01")))  # Python 3 print-function form