Tuesday, October 3, 2017

how to start your Chinese NLP journey?

# how to import a Chinese corpus for NLP
# http://cpmarkchang.logdown.com/posts/184192-python-nltk-sinica-treebank
# http://ckip.iis.sinica.edu.tw/CKIP/treebank.htm
# http://museum02.digitalarchives.tw/ndap/2002/SinicaTreebank/ckip.iis.sinica.edu.tw/CKIP/tr/201301_20140813.pdf
>>> from nltk.corpus import sinica_treebank
>>> import nltk

# get all treebank words at once
>>> sinica_treebank.words()
['\xe4\xb8\x80', '\xe5\x8f\x8b\xe6\x83\x85', ...]

# get tagged_words and sentences in treebank
>>> sinica_treebank.tagged_words()
[('\xe4\xb8\x80', 'Neu'), ('\xe5\x8f\x8b\xe6\x83\x85', 'Nad'), ...]
>>> sinica_treebank.sents()[15]
['\xe5\xa4\xa7\xe8\x81\xb2', '\xe7\x9a\x84', '\xe5\x8f\xab', '\xe8\x91\x97']

# get the grammar tree
>>> sinica_treebank.parsed_sents()[15]
Tree('VP', [Tree('V\xe2\x80\xa7\xe5\x9c\xb0', [Tree('VH11', ['\xe5\xa4\xa7\xe8\x81\xb2']), Tree('DE', ['\xe7\x9a\x84'])]), Tree('VE2', ['\xe5\x8f\xab']), Tree('Di', ['\xe8\x91\x97'])])

# draw the grammar tree
>>> sinica_treebank.parsed_sents()[15].draw()

# get concordance
>>> sinica_text=nltk.Text(sinica_treebank.words())
>>> sinica_text.concordance('我')

# frequency distribution
>>> sinica_fd=nltk.FreqDist(sinica_treebank.words())
>>> top100=sinica_fd.most_common(100)
>>> for (x,y) in top100:
...     print(x, y)

# how to list all docs in a corpus
# http://www.burnelltek.com/blog/0376c9eac69611e6841d00163e0c0e36
import nltk
from nltk.corpus import gutenberg
print(gutenberg.fileids())

# corpus from web
from nltk.corpus import webtext
print(webtext.fileids())

# corpus of inaugural addresses
from nltk.corpus import inaugural
print(inaugural.fileids())

# corpus from chat
from nltk.corpus import nps_chat
print(nps_chat.fileids())

# how to get a document from a corpus
emma = gutenberg.words("austen-emma.txt")
print(emma)

# entropy, pointwise mutual information (PMI), and perplexity are common measures used in sentiment classification and related NLP tasks
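# a rough sketch of how two of these measures can be computed with NLTK (assuming the
# brown corpus has been downloaded via nltk.download('brown')); the corpus, category,
# and frequency filter below are illustrative choices, not part of the original notes
import math
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import brown

words = brown.words(categories='news')

# rank bigrams by pointwise mutual information, skipping rare pairs
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(5)
print(finder.nbest(bigram_measures.pmi, 10))

# entropy of the unigram distribution: -sum(p * log2(p))
fd = nltk.FreqDist(w.lower() for w in words)
total = fd.N()
print(-sum((c / total) * math.log2(c / total) for c in fd.values()))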

# how to generate all possible bigrams
sent = ['I', 'am', 'a', 'good', 'man']
print(list(nltk.bigrams(sent)))


# how to use conditional frequency distribution
# http://www.burnelltek.com/blog/e08e0bbecb1811e6841d00163e0c0e36
import nltk
from nltk.corpus import brown
pairs = [(genre, word) for genre in brown.categories() for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(pairs)

# show conditions for conditional frequency distribution
print(cfd.conditions())

# display a table for specified conditions and term frequency
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# display a plot for conditional frequency distribution
cfd.plot(conditions=genres, samples=modals)

# how to get the frequency distribution of words that follow a specified word, using bigrams
text = brown.words(categories='news')
bigrams_words = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams_words)
fd = cfd['can']
fd.plot(10)

# how to process pos tags
import nltk
words = nltk.word_tokenize('And now for something completely different')
print(words)
word_tag = nltk.pos_tag(words)
print(word_tag)

# nltk pos tagging for Chinese
nltk.word_tokenize(text): tokenizes the given sentence and returns a list of words
nltk.pos_tag(words): POS-tags the given word list and returns a list of (word, tag) pairs
CategorizedTaggedCorpusReader::tagged_words(fileids, categories): takes file ids or category ids and returns the POS-tagged word list of those texts
CategorizedTaggedCorpusReader::tagged_sents(fileids, categories): takes file ids or category ids and returns the POS-tagged sentence list of those texts, where each sentence is a list of words
SinicaTreebankCorpusReader::tagged_words(fileids): takes file ids and returns the POS-tagged word list of those texts
SinicaTreebankCorpusReader::tagged_sents(fileids): takes file ids and returns the POS-tagged sentence list of those texts, where each sentence is a list of words
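# a small usage sketch for the readers listed above (assuming the brown and
# sinica_treebank corpora have been downloaded via nltk.download)
from nltk.corpus import brown, sinica_treebank

# brown is a CategorizedTaggedCorpusReader: filter by category
print(brown.tagged_words(categories='news')[:5])
print(brown.tagged_sents(categories='news')[0])

# sinica_treebank is a SinicaTreebankCorpusReader: tagged Chinese words and sentences
print(sinica_treebank.tagged_words()[:5])
print(sinica_treebank.tagged_sents()[15])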

# how to count all segmented Chinese words
python -m jieba 1.txt -q | tr '/' '\n' | sed 's/^[ \t]*//; s/[ \t]*$//' | sort | uniq -c
python -m jieba 1.txt -q | tr '/' '\n' | sed 's/^[ \t]*//; s/[ \t]*$//' # trim the spaces jieba leaves around each word

# how to calculate the word diversity (type-token ratio)
numerator=$(python -m jieba 1.txt -q | tr '/' '\n' | sed 's/^[ \t]*//; s/[ \t]*$//' | sort | uniq | wc -l) # count the unique words in the doc
denominator=$(python -m jieba 1.txt -q | tr '/' '\n' | sed 's/^[ \t]*//; s/[ \t]*$//' | sort | wc -l) # count all words in the doc
words_diversity=$(echo "scale=4; $numerator / $denominator" | bc)
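# the same ratio computed directly in Python; a rough sketch assuming jieba is
# installed and the text lives in 1.txt (UTF-8)
import jieba

with open('1.txt', encoding='utf-8') as f:
    text = f.read()

tokens = [w for w in jieba.cut(text) if w.strip()]  # drop pure-whitespace tokens
print(len(set(tokens)) / len(tokens))  # unique words / total words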

# online code search from the command line
https://github.com/stayhigh/how2
pip install how2

https://github.com/gautamkrishnar/socli
sudo apt-get install python python-pip
sudo pip install socli

https://www.npmjs.com/package/stack-overflow-search
npm install -g stack-overflow-search