Less is Best

I like Ruby. I'm interested in tech talk and startups.

Introduction to Natural Language Processing: Chapter 2 Exercises, Part 1

1. Create a variable containing a list of words, then try some operations on it

wlist = ["summer","autumn","winter","spring"]
print wlist


#raises an error > you can only concatenate a list with another list
#wlist + "fortal"
#print wlist



print wlist + ["fortal"]
#>['summer', 'autumn', 'winter', 'spring', 'fortal']

print wlist*2
#>['summer', 'autumn', 'winter', 'spring', 'summer', 'autumn', 'winter', 'spring']

print wlist[2]
#>winter

print wlist[1:3]
#>['autumn', 'winter']

print sorted(wlist)
#>['autumn', 'spring', 'summer', 'winter']

2. Counting word tokens and distinct word types

# -*- coding: utf-8 -*-
import nltk

from nltk.corpus import gutenberg
austen = gutenberg.words('austen-persuasion.txt')

print len(austen)       # number of word tokens
print len(set(austen))  # number of distinct word types
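As a small aside (my own addition, not part of the exercise), dividing tokens by types gives the average number of times each word type gets reused:

# -*- coding: utf-8 -*-
from __future__ import division
from nltk.corpus import gutenberg

austen = gutenberg.words('austen-persuasion.txt')

# tokens divided by types: on average, how often each distinct word is used
print len(austen) / len(set(austen))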

3. Skipped.

4. Frequency of the words men, women, and people in the State of the Union addresses

Just reading the explanation, ConditionalFreqDist didn't make much sense to me, but it clicked once I solved this.

The mental model: use nested for loops to build a list of (condition, word) pairs in word_list, feed it into ConditionalFreqDist, and then call the various methods on the result.

# -*- coding: utf-8 -*-
import nltk

from nltk.corpus import state_union

word_list = [(fileid[:4],target) 
            for fileid in state_union.fileids()
            for word in state_union.words(fileids=fileid)
            for target in ["men","women","people"]
            if word.lower() == target]

cfd = nltk.ConditionalFreqDist(word_list)

cfd.tabulate()
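Beyond tabulate(), the resulting cfd can be inspected directly; a short usage sketch, where '1945' is just an arbitrary year prefix I picked:

print cfd.conditions()[:5]  # the first few year-prefix conditions
print cfd['1945']           # FreqDist over men/women/people for that year
print cfd['1945']['men']    # a single count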

5. Holonyms and meronyms

# -*- coding: utf-8 -*-
import nltk


from nltk.corpus import wordnet as wn

words = ["people.n.01","computer.n.01","source.n.01","lion.n.01","card.n.01",
         "bed.n.01"]

for word in words:
  synset = wn.synset(word)

  print "word: %s" % word

  print "meronyms"
  print synset.member_meronyms()
  print synset.part_meronyms()
  print synset.substance_meronyms()

  print "holonyms"
  print synset.member_holonyms()
  print synset.part_holonyms()
  print synset.substance_holonyms()
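As a sanity check on what these relations mean, the tree example from the NLTK book: the parts of a tree (trunk, limb, ...) are its meronyms, and the forest it belongs to is a holonym:

print wn.synset('tree.n.01').part_meronyms()    # burl, crown, stump, trunk, limb
print wn.synset('tree.n.01').member_holonyms()  # forest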

6. Skipping this one.

When one word carries several meanings, the question is how to pick the right translation. I suppose that's where WordNet comes in.
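For instance, just listing the senses WordNet records for one surface form makes the ambiguity concrete (a minimal sketch, my own illustration):

# -*- coding: utf-8 -*-
from nltk.corpus import wordnet as wn

# one spelling, many senses: each synset is a separate translation candidate
for synset in wn.synsets('bank'):
  print synset.name, synset.definition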

7. Skipping this one too.

8. First letters of male and female names

# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import names

cfd = nltk.ConditionalFreqDist(
  (fileid[:-4], name[0])
  for fileid in names.fileids()
  for name in names.words(fileids=fileid)
)

cfd.tabulate()

9. Vocabulary, vocabulary richness, and genre differences between two texts

Along the way I also got to study some Python modules; sys and StringIO were both "huh, neat" discoveries for me.
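The core trick is that print writes to whatever sys.stdout points at; a minimal sketch of the capture pattern on its own:

# -*- coding: utf-8 -*-
import sys
from StringIO import StringIO

sys.stdout = buf = StringIO()  # print now writes into the buffer
print "hello"
sys.stdout = sys.__stdout__    # restore the real stdout
print buf.getvalue()           # the captured text ("hello\n")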

# -*- coding: utf-8 -*-
import sys
from StringIO import StringIO

import nltk
from nltk.text import Text

sys.stdout = StringIO()

from nltk.book import *
sys.stdout = sys.__stdout__

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


#find words that are used with completely different meanings in the two texts
def different_mean_words(text1,text2):
  words1 = set(text1)
  words2 = set(text2)

  common_words = (words1 & words2) - stop_words

  ans = []
  for w in common_words:
    sys.stdout = output1 = StringIO()
    text1.similar(w)
    sys.stdout = output2 = StringIO()
    text2.similar(w)
    sys.stdout = sys.__stdout__
    similar_words1 = output1.getvalue().replace('Building word-context index...', '')
    similar_words2 = output2.getvalue().replace('Building word-context index...', '')
    similar1 = set(similar_words1.split())  # split() handles the newlines in similar()'s wrapped output
    similar2 = set(similar_words2.split())
    if len(similar1 & similar2) == 0:
      print w
      ans.append(w)
  return ans

result = different_mean_words(text1,text2)
print result

10

# -*- coding: utf-8 -*-

import nltk
from nltk.book import *

#how many distinct word types make up the top third of each text
#loop over fdist.keys() (frequency-sorted in old NLTK), accumulating fdist[word]
#stop once the running total reaches a third of the whole text
#print the number of word types collected
for text in [text1,text2,text3,text4,text5,text6,text7,text8,text9]:
  fdist = nltk.FreqDist(text)


  ans_len,ans_words = 0,[]
  for word in fdist.keys():
    if not word.isalnum():
      continue
    if ans_len > len(text) * 0.33:
      break

    ans_len += fdist[word]
    ans_words.append(word)
  print len(ans_words),text

Results

48 <Text: Moby Dick by Herman Melville 1851>
33 <Text: Sense and Sensibility by Jane Austen 1811>
21 <Text: The Book of Genesis>
23 <Text: Inaugural Address Corpus>
53 <Text: Chat Corpus>
92 <Text: Monty Python and the Holy Grail>
94 <Text: Wall Street Journal>
74 <Text: Personals Corpus>
36 <Text: The Man Who Was Thursday by G . K . Chesterton 1908>

My numbers differ from the person whose solution I was referring to, probably because I didn't use a regex to filter tokens, but the gap is large enough that it worries me a little.

I hope I haven't gotten this wrong.
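One thing worth flagging for anyone re-running this: the loop relies on the old NLTK behavior where fdist.keys() returned words in descending frequency order. In NLTK 3 that guarantee is gone, so the accumulation would have to iterate over most_common() instead; a sketch of just the changed loop:

# NLTK 3: fdist.keys() is no longer frequency-sorted, so use most_common()
for word, freq in fdist.most_common():
  if not word.isalnum():
    continue
  if ans_len > len(text) * 0.33:
    break
  ans_len += freq
  ans_words.append(word)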

11

Distribution table of modal verbs

# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import brown

modals = "will may can must shall should could would".split(' ')

cfd = nltk.ConditionalFreqDist(
  (category,w)
  for category in brown.categories()
  for w in brown.words(categories=category)
  if w in modals
  )

cfd.tabulate()

Results

                   can could  may must shall  should will would
      adventure     46   151    5   27     7      15   50   191
 belles_lettres    246   213  207  170    34     102  236   392
      editorial    121    56   74   53    19      88  233   180
        fiction     37   166    8   55     3      35   52   287
     government    117    38  153  102    98     112  244   120
        hobbies    268    58  131   83     5      73  264    78
          humor     16    30    8    9     2       7   13    56
        learned    365   159  324  202    40     171  340   319
           lore    170   141  165   96    12      76  175   186
        mystery     42   141   13   30     1      29   20   186
           news     93    86   66   50     5      59  389   244
       religion     82    59   78   54    21      45   71    68
        reviews     45    40   45   19     1      18   58    47
        romance     74   193   11   45     3      32   43   244
science_fiction     16    49    4    8     3       3   16    79
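One small note: tabulate() emits the columns in alphabetical order by default; passing the samples keyword (the NLTK book uses it the same way) keeps the columns in the order of the modals list:

cfd.tabulate(samples=modals)  # columns follow the order of the modals list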

12

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

#number of words with multiple pronunciations, and the phoneme where the pronunciations differ
from nltk.corpus import cmudict

entries = nltk.corpus.cmudict.entries()

cnt, before, before_pron, syllables = 0, None, [], dict()
for word, pron in entries:
  if word != before:
    cnt += 1
  else:
    for syllable in pron:
      if syllable not in before_pron:
        syllables[word] = syllable  # the phoneme where the pronunciations differ

  before = word
  before_pron = pron

print (len(entries)-cnt)/len(entries)  # proportion of words with multiple pronunciations
print syllables  # dict mapping each word to a differing phoneme
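As a quick spot check (my own addition): 'tomato' is the NLTK book's standard example of a word with two CMUdict pronunciations, so it should have landed in the syllables dict:

print syllables.get('tomato')  # a phoneme where the two pronunciations differ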

13. Noun synsets with no hyponyms

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import wordnet as wn

cnt, total = 0, 0

for synset in wn.all_synsets('n'):
  total += 1
  if not synset.hyponyms():
    cnt += 1

print cnt/total  # fraction of noun synsets that have no hyponyms

14.

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import wordnet as wn

#return one string concatenating the definition of s with the definitions of all its hyponyms and hypernyms
def supergloss(s):
  def defstring(s):
    return "[%s] %s\n" % (s.name, s.definition)

  definition = defstring(s)
  for hyponym in s.hyponyms():
    definition += defstring(hyponym)

  for hypernym in s.hypernyms():
    definition += defstring(hypernym)
  return definition

print supergloss(wn.synset("car.n.01"))
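A caveat for anyone running this on a current NLTK: in NLTK 3 a synset's name and definition became methods rather than attributes, so defstring would need to call them:

def defstring(s):
  # NLTK 3: name and definition are methods, not attributes
  return "[%s] %s\n" % (s.name(), s.definition())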