
Less is Best

I like Ruby. Interested in tech topics and startups.

Introduction to Natural Language Processing: Chapter 2 Exercises, Part 2

For now, up through the intermediate-difficulty problems.

15.

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import brown

# print every word that occurs at least three times in the first Brown file
text = nltk.Text(brown.words(brown.fileids()[0]))
fdist = nltk.FreqDist(text)

for w in fdist.keys():
    if fdist[w] >= 3:
        print w, fdist[w]
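As an aside: fdist.keys() in the NLTK version used here comes back sorted by decreasing frequency (the same property the later exercises rely on with fdist.keys()[:50]), so the loop could also bail out at the first word under the threshold. A sketch under that assumption:

# relies on FreqDist.keys() being sorted by decreasing count (NLTK 2.x);
# in NLTK 3+ the order is arbitrary and fdist.most_common() should be used
for w in fdist.keys():
    if fdist[w] < 3:
        break  # every remaining word occurs fewer than 3 times
    print w, fdist[w]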

16.

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

# token count, type (distinct word) count, and lexical diversity per category

from nltk.corpus import brown

print "%s\t %s\t %s\t %s" % ("category", "tokens", "types", "diversity")
for category in brown.categories():
    words = brown.words(categories=category)
    words_len = len(words)
    words_uniq_len = len(set(words))
    print "%16s\t%6d\t%8d\t%2.2f" % (category, words_len, words_uniq_len, words_len / words_uniq_len)

Output (diversity = tokens / types, i.e. how many times the average word type is reused):

    category          tokens  types  diversity
    adventure          69342   8874       7.81
    belles_lettres    173096  18421       9.40
    editorial          61604   9890       6.23
    fiction            68488   9302       7.36
    government         70117   8181       8.57
    hobbies            82345  11935       6.90
    humor              21695   5017       4.32
    learned           181888  16859      10.79
    lore              110299  14503       7.61
    mystery            57169   6982       8.19
    news              100554  14394       6.99
    religion           39399   6373       6.18
    reviews            40704   8626       4.72
    romance            70022   8452       8.28
    science_fiction    14470   3233       4.48

17.

With stopwords alone, tokens like "," and "." slip through and clutter the results, so I try isalnum() as well.
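A quick check of why (a minimal sketch): the stop-word list contains function words, not punctuation, so punctuation sails straight through a stop-word filter.

from nltk.corpus import stopwords

print "," in stopwords.words("english")    # False -> punctuation survives the filter
print "the" in stopwords.words("english")  # True

Hence the extra isalnum() check in the solution below.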

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import stopwords
from nltk.book import *

# the 50 most frequent words of a text, skipping stop words and punctuation
def frequent_50(text):
    stop_words = stopwords.words("english")
    fdist = FreqDist(text)
    top_50, i = [], 0

    for w in fdist:
        if w not in stop_words and w.isalnum():
            top_50.append(w)
            i += 1
        if i == 50:
            break
    return top_50

print frequent_50(text1)

18. The most frequent bigrams in a text, excluding stop words

The lambda expression passed to filter makes sense. And I picked this up from here:

for pair in fdist.keys()[:50]:

I see, so you can write it this way too.
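As a tiny illustration of the filter/lambda pattern (my own sketch, not from the exercise): filter keeps only the elements for which the lambda returns True.

print filter(lambda x: x % 2 == 0, range(10))  # [0, 2, 4, 6, 8]

The solution itself: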

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import stopwords
from nltk.book import *

# the 50 most frequent bigrams of a text, skipping stop words and punctuation
def frequent_bigrams_50(text):
    stop_words = stopwords.words("english")
    bigrams = nltk.bigrams(text)
    # keep only bigrams where both members are non-stop-word, alphanumeric tokens
    bigrams = filter(lambda x: x[0] not in stop_words and x[1] not in stop_words
                     and x[0].isalnum() and x[1].isalnum(), bigrams)
    fdist = FreqDist(bigrams)
    for pair in fdist.keys()[:50]:
        print pair, fdist[pair]

frequent_bigrams_50(text1)

19.

For a start, I went looking for emotion words. "surprised" is common in fiction, mystery, and romance, which feels exactly right. adventure looked like it has the richest range of emotional expression.

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import brown

emotion_words = "mad relieved irritated glad angry calm surprised comfortable sad upset hurt nervous furious depressed".split(' ')

# count each emotion word, conditioned on Brown category
cfd = nltk.ConditionalFreqDist(
    (category, word)
    for category in brown.categories()
    for word in brown.words(categories=category)
    if word in emotion_words
)

cfd.tabulate()

Raw counts are hard to compare across categories of different sizes, though. This person goes as far as normalizing the table, which is very instructive:

from nltk import ConditionalProbDist, ELEProbDist

# tabulate smoothed probabilities (as percentages) instead of raw counts
def cpd_tabulate(cpd, samples):
    conditions = cpd.conditions()
    condition_size = max(len(str(c)) for c in conditions)
    print ' ' * condition_size,
    for s in samples:
        print "%*s" % (max(5, len(str(s))), str(s)),
    print
    for c in conditions:
        print "%*s" % (condition_size, str(c)),
        freqs = [cpd[c].prob(sample) * 100 for sample in samples]
        for i, f in enumerate(freqs):
            print "%*s" % (max(5, len(samples[i])), '%1.3f' % f),
        print

cpd = ConditionalProbDist(cfd, ELEProbDist)
cpd_tabulate(cpd, emotion_words)
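ELEProbDist is the expected likelihood estimate, i.e. Lidstone smoothing with gamma = 0.5: each count gets 0.5 added before normalizing, so P(w) = (count(w) + 0.5) / (N + 0.5 * B), where N is the token count and B the number of bins. A minimal check of the formula:

from nltk import FreqDist, ELEProbDist

fd = FreqDist("aab")               # counts: a=2, b=1, N=3
pd = ELEProbDist(fd, bins=fd.B())  # B = 2 distinct samples
print pd.prob("a")                 # (2 + 0.5) / (3 + 0.5 * 2) = 0.625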

20. A function that computes the frequency of a word within a Brown category

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import brown

# percentage frequency of a word within the given Brown category
def word_freq(word, category):
    words = brown.words(categories=category)
    fdist = nltk.FreqDist(words)
    return (fdist[word] / fdist.N()) * 100

print "%2.3f" % word_freq("surprised", "adventure")

21. Return the total syllable count of a text

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import brown
from nltk.corpus import cmudict

# return the total syllable count of a text, using the CMU pronouncing dictionary
def count_syllables(text):
    prondict = cmudict.dict()
    cnt_syllables = 0
    for w in text:
        w = w.lower()  # cmudict keys are lower-case
        if w in prondict:
            # syllable nuclei are the vowel phones, which carry a stress digit;
            # counting every phone of the first pronunciation would overcount
            cnt_syllables += len([ph for ph in prondict[w][0] if ph[-1].isdigit()])
    return cnt_syllables

print count_syllables(brown.words(categories="romance"))
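For reference, each cmudict entry is a list of pronunciations, and each pronunciation a list of Arpabet phones; only the vowel phones carry a stress digit, which is what the syllable count above keys on:

prondict = cmudict.dict()
print prondict["fire"]  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']] -> 2 and 1 syllables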

22.

# -*- coding: utf-8 -*-
from __future__ import division
import nltk

from nltk.corpus import brown

# build a new text with "like" inserted after every third word;
# non-alphanumeric tokens are not counted as words
# input: an nltk.Text object (any iterable of token strings will do)
def hedge(text):
    hedged, cnt = [], 0

    for w in text:
        if not w.isalnum():
            continue
        if (cnt % 3 == 0) and cnt > 0:
            hedged.append("like")
        hedged.append(w)
        cnt += 1

    return hedged

print hedge(brown.words(categories="romance"))
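A quick sanity check on a small input (the function only needs an iterable of token strings, so a plain list works too):

print hedge("one two three four five six seven".split())
# ['one', 'two', 'three', 'like', 'four', 'five', 'six', 'like', 'seven']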