Ciao...†
Descending...†
I recently read the book 前処理大全 and was inspired by it, so in this post I'd like to list preprocessing steps for natural language processing, plus ways of building features, together with Python code. You don't necessarily need to do all of them, so use whatever fits your case.
前処理大全[データ分析のためのSQL/R/Python実践テクニック]
- Author: 本橋智光
- Publisher: 技術評論社
- Release date: 2018/04/13
- Format: Large-format book
Preprocessing
Removing extra newlines, spaces, and the like
with open(path) as fd:
    for line in fd:
        line = line.rstrip()
Lowercasing letters
text = text.lower()
Normalization (half-width/full-width conversion, etc.)
import neologdn

neologdn.normalize('ﾊﾝｶｸｶﾅ')  # => 'ハンカクカナ'
neologdn.normalize('全角記号！？＠＃')  # => '全角記号!?@#'
neologdn.normalize('全角記号例外「・」')  # => '全角記号例外「・」'
neologdn.normalize('長音短縮ウェーーーーイ')  # => '長音短縮ウェーイ'
neologdn.normalize('チルダ削除~∼∾〜〰～')  # => 'チルダ削除'
neologdn.normalize('いろんなハイフン˗֊‐‑‒–⁃⁻₋−')  # => 'いろんなハイフン-'
neologdn.normalize('　　　ＰＲＭＬ　　副　読　本　　　')  # => 'PRML副読本'
neologdn.normalize(' Natural Language Processing ')  # => 'Natural Language Processing'
neologdn.normalize('かわいいいいいいいいい', repeat=6)  # => 'かわいいいいいい'
GitHub - ikegami-yukino/neologdn: Japanese text normalizer for mecab-neologd
Tokenization
MeCab
import MeCab

mecab_wakati = MeCab.Tagger('-Owakati')
words = mecab_wakati.parse(text).strip().split()
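For example, a minimal sketch of the tokenized output (assuming the default IPADIC dictionary; the sample sentence is just an illustration):

import MeCab

mecab_wakati = MeCab.Tagger('-Owakati')
print(mecab_wakati.parse('すもももももももものうち').strip().split())
# => ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']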
SentencePiece
import sentencepiece as spm

spm.SentencePieceTrainer.Train('--input=test/botchan.txt --model_prefix=test_model --vocab_size=1000')
sp = spm.SentencePieceProcessor()
sp.Load('test_model.model')  # the trained model is written next to the script as test_model.model
sp.Encode('This is a test')
sentencepiece/README.md at master · google/sentencepiece · GitHub
NLTK
from nltk.tokenize import sent_tokenize, word_tokenize

text = "I'm crying. Is this a pen?"
sentences = sent_tokenize(text)
words = list(map(word_tokenize, sentences))
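For reference, here is roughly what the result looks like with NLTK's default tokenizers:

from nltk.tokenize import sent_tokenize, word_tokenize

sent_tokenize("I'm crying. Is this a pen?")
# => ["I'm crying.", 'Is this a pen?']
word_tokenize("I'm crying.")
# => ['I', "'m", 'crying', '.']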
Removing emoji
import emoji

# keep only tokens that are not emoji (emoji.UNICODE_EMOJI is a dict keyed by emoji characters in emoji < 1.0)
words = list(filter(lambda x: x not in emoji.UNICODE_EMOJI, words))
Expanding contractions
If you know of a library that can do this, please let me know...
shortened = (
    ('\'m', ' am'),
    ('\'re', ' are'),
    ('don\'t', 'do not'),
    ('doesn\'t', 'does not'),
    ('didn\'t', 'did not'),
    ('won\'t', 'will not'),
    ('wanna', 'want to'),
    ('gonna', 'going to'),
    ('gotta', 'got to'),
    ('hafta', 'have to'),
    ('needa', 'need to'),
    ('outta', 'out of'),
    ('kinda', 'kind of'),
    ('sorta', 'sort of'),
    ('lotta', 'lot of'),
    ('lemme', 'let me'),
    ('gimme', 'give me'),
    ('getcha', 'get you'),
    ('gotcha', 'got you'),
    ('letcha', 'let you'),
    ('betcha', 'bet you'),
    ('shoulda', 'should have'),
    ('coulda', 'could have'),
    ('woulda', 'would have'),
    ('musta', 'must have'),
    ('mighta', 'might have'),
    ('dunno', 'do not know'),
    ('sup', 'what is up'),
)
for (before, after) in shortened:
    sentence = sentence.replace(before, after)
Removing HTML tags
# nltk.clean_html() was removed in NLTK 3.0, which recommends BeautifulSoup instead
from bs4 import BeautifulSoup

raw = BeautifulSoup(html, 'html.parser').get_text()
Removing stop words
Filter out stop words using a list such as the SlothLib stop word list.
with open('Japanese.txt') as fd:
    stop_words = frozenset(fd.read().splitlines())
words = list(filter(lambda x: x not in stop_words, words))
For English
from nltk.corpus import stopwords

stop_words = frozenset(stopwords.words('english'))
words = list(filter(lambda x: x not in stop_words, words))
Extracting only specific parts of speech
import MeCab

CONTENT_WORD_POS = ('名詞', '動詞', '形容詞', '副詞')
tagger = MeCab.Tagger()
words = []
for line in tagger.parse(sentence).splitlines()[:-1]:
    surface, feature = line.split('\t')
    if feature.startswith(CONTENT_WORD_POS) and ',非自立,' not in feature:
        words.append(surface)
Stemming
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
words = list(map(stemmer.stem, words))
Lemmatization
from nltk.stem.wordnet import WordNetLemmatizer

wnl = WordNetLemmatizer()
words = list(map(wnl.lemmatize, words))
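Note that WordNetLemmatizer assumes the noun part of speech by default; passing a POS tag changes the result. A small sketch:

from nltk.stem.wordnet import WordNetLemmatizer

wnl = WordNetLemmatizer()
wnl.lemmatize('cars')          # => 'car'
wnl.lemmatize('running')       # => 'running' (treated as a noun)
wnl.lemmatize('running', 'v')  # => 'run'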
For Japanese
import MeCab

tagger = MeCab.Tagger()
lemmas = []
for line in tagger.parse('行った').splitlines()[:-1]:
    surface, feature = line.split('\t')
    if feature.split(',')[6] != '*':
        lemmas.append(feature.split(',')[6])  # the base form 「行く」 is appended
Fixing typos
import pytypo

words = list(map(pytypo.correct, words))
Adding <BOS> and <EOS>
Add a BOS (Beginning of Sentence) marker for the start of a sentence and an EOS (End of Sentence) marker for the end.
with open(path) as fd:
    for line in fd:
        words = ['<BOS>'] + line.split() + ['<EOS>']
Processing to turn text into features
Mapping words to IDs
from collections import defaultdict

word_to_id = defaultdict(lambda: len(word_to_id))
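With this defaultdict trick, simply looking a word up assigns it the next unused ID. A minimal usage sketch (the sample words are just an illustration):

from collections import defaultdict

word_to_id = defaultdict(lambda: len(word_to_id))
for word in ['cat', 'dog', 'cat', 'bird']:
    word_to_id[word]
print(dict(word_to_id))  # => {'cat': 0, 'dog': 1, 'bird': 2}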
n-gram
Libraries such as scikit-learn will do this for you (see the sketch after the next snippet), so it is rare to have to write your own n-gram code.
def to_ngrams(item, max_n):
    return [to_ngram(item, n) for n in range(2, max_n + 1)]

def to_ngram(item, n):
    return [item[i:i+n] for i in range(len(item)-n+1)]
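As mentioned above, scikit-learn can produce n-gram features directly; a minimal sketch with CountVectorizer (the parameters are just examples):

from sklearn.feature_extraction.text import CountVectorizer

# word 1-grams and 2-grams
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(['this is a pen', 'is this a pen'])
# character 2-grams and 3-grams
char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))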
Bag-of-Words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
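Continuing from the snippet above, the learned vocabulary and the count matrix can be inspected like this (the output shown is what CountVectorizer typically produces for this corpus):

print(vectorizer.get_feature_names())
# => ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
print(X.toarray())  # dense document-term count matrix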
4.2. Feature extraction — scikit-learn 0.19.1 documentation
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
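Continuing from the TF-IDF snippet, the IDF weights and the weighted matrix can be inspected (a minimal sketch):

print(vectorizer.idf_)  # one IDF weight per vocabulary term
print(X.toarray())      # TF-IDF weighted document-term matrix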
4.2. Feature extraction — scikit-learn 0.19.1 documentation
Distributed representations of words (word embeddings)
word2vec
import os
from gensim.models.word2vec import Word2Vec, PathLineSentences

sentences = PathLineSentences(os.path.join(os.getcwd(), 'corpus'))
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.wv['computer']
gensim: models.word2vec – Deep learning with word2vec
FastText
from gensim.models import FastText

sentences = [['cat', 'say', 'meow'], ['dog', 'say', 'woof']]
model = FastText(sentences)
say_vector = model.wv['say']  # get vector for word
of_vector = model.wv['of']  # get vector for out-of-vocab word
gensim: models.fasttext – FastText model
Distributed representations of documents
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

docs = []
docs.append(TaggedDocument(words=['cat', 'say', 'meow'], tags=['cat']))
docs.append(TaggedDocument(words=['dog', 'say', 'woof'], tags=['dog']))
model = Doc2Vec(documents=docs, min_count=1)
print(model.docvecs['cat'])
gensim: models.doc2vec – Deep learning with paragraph2vec
Maximal substrings
import pykwic

kwic = pykwic.EKwic()
kwic.add_line('うなぎうなうなうなぎなう')
kwic.add_line('うらにはにわにわとりがいる')
kwic.build()
maxsubsts = []
for maxsubst in kwic.maxmal_substring():
    maxsubsts.append(maxsubst[0])
unicodeテキストをkwicするpythonライブラリpykwic作った - (setf yaruki nil) - nlpyutoriグループ
Abstraction by character type
These are the features used in tools such as TinySegmenter.
import re

patterns = (
    (re.compile('[一二三四五六七八九十百千万億兆]'), 'M'),
    (re.compile('[一-龠々〆ヵヶ]'), 'H'),
    (re.compile('[ぁ-ん]'), 'I'),
    (re.compile('[ァ-ヴーｱ-ﾝﾞｰ]'), 'K'),
    (re.compile('[a-zA-Zａ-ｚＡ-Ｚ]'), 'A'),
    (re.compile('[0-9０-９]'), 'N'),
)
features = []
for char in sentence:
    for pattern, val in patterns:
        if pattern.match(char):
            features.append(val)
            break
word = 'Melbourne'
feature = []
feature.append('word.lower=%s' % word.lower())
feature.append('word.isupper=%s' % word.isupper())
feature.append('word.istitle=%s' % word.istitle())
feature.append('word.isdigit=%s' % word.isdigit())
Word suffixes
These are sometimes used in named entity recognition or in tasks such as estimating gender from a name.
word = 'Melbourne'
feature = []
feature.append('word[-3:]=%s' % word[-3:])
feature.append('word[-2:]=%s' % word[-2:])
Paraphrasing with an ontology
Use an ontology such as WordNet to replace words with synonyms or to abstract them to hypernyms (broader concepts).
import MeCab
from nltk.corpus import wordnet as wn

def extract_noun(sentence):
    tagger = MeCab.Tagger()
    nouns = []
    for line in tagger.parse(sentence).splitlines()[:-1]:
        surface, feature = line.split('\t')
        if feature.startswith('名詞'):
            nouns.append(surface)
    return nouns

sentence = '猫が鳴いてる'
nouns = extract_noun(sentence)
sentences = []
for noun in nouns:
    for synset in wn.synsets(noun, lang='jpn', pos=wn.NOUN):
        for synonym in wn.synset(synset.name()).lemma_names('jpn'):
            sentences.append(sentence.replace(noun, synonym))
        for hypernym in synset.hypernyms():
            for hypernym_synset in wn.synset(hypernym.name()).lemma_names('jpn'):
                sentences.append(sentence.replace(noun, hypernym_synset))
Paraphrasing with similar words from embeddings
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('w2v.model')
word = '雨'
sentence = '雨に唄えば'
sentences = []
for (similar_word, score) in model.wv.most_similar([model.wv[word]], [], 5):
    if word == similar_word:
        continue
    sentences.append(sentence.replace(word, similar_word))
gensim: models.word2vec – Deep learning with word2vec
Abstracting documents with topic models and clustering
These came from tasks that were not strictly NLP, but I have used features like the following.
Latent Dirichlet Allocation
import joblib

lda = joblib.load('lda.model')
lda.transform(X).argmax(axis=1)
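The snippet above assumes a topic model that was trained earlier and saved as 'lda.model'. A minimal sketch of how such a model might be fitted and saved (the parameters and the use of the Bag-of-Words matrix X are assumptions):

from sklearn.decomposition import LatentDirichletAllocation
import joblib

# X: a document-term matrix, e.g. the CountVectorizer output from the Bag-of-Words section
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(X)
joblib.dump(lda, 'lda.model')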
sklearn.decomposition.LatentDirichletAllocation — scikit-learn 0.19.1 documentation
KMeans
import joblib

kmeans = joblib.load('kmeans.model')
kmeans.predict(X)
sklearn.cluster.KMeans — scikit-learn 0.19.1 documentation
Distance between strings
word2vec
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('w2v.model')
model.wv.similarity(word1, word2)
gensim: models.word2vec – Deep learning with word2vec
FastText
from gensim.models import FastText

model = FastText.load('fasttext.model')
model.wv.similarity(word1, word2)
gensim: models.fasttext – FastText model
Levenshtein edit distance, Hamming distance, and the like
import distance

distance.levenshtein('lenvestein', 'levenshtein')
distance.hamming('hamming', 'hamning')
GitHub - doukremt/distance: Levenshtein and Hamming distance computation
Distance between documents
Doc2Vec
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('doc2vec.model')
model.docvecs.similarity(doc1, doc2)
model.docvecs.similarity_unseen_docs(model, doc1, doc2)
gensim: models.doc2vec – Deep learning with paragraph2vec
Normalized compression distance
import zlib

def ncd(x, y):
    if x == y:
        return 0
    x, y = x.encode(), y.encode()  # zlib.compress expects bytes
    z_x = len(zlib.compress(x))
    z_y = len(zlib.compress(y))
    z_xy = len(zlib.compress(x + y))
    return float(z_xy - min(z_x, z_y)) / max(z_x, z_y)

if __name__ == '__main__':
    query = 'Hello, world!'
    results = ['Hello, Python world!', 'Goodbye, Python world!', 'world record']
    for r in results:
        print(r, ncd(query, r))
Tech Tips: Normalized compression distanceとNormalized Google distance
Lexical Density
from __future__ import division

import MeCab

CONTENT_WORD_POS = ('名詞', '動詞', '形容詞', '副詞')

def compute_lexical_density(sentence):
    t = MeCab.Tagger()
    n = t.parseToNode(sentence)
    content_words = 0
    total = 0
    while n:
        if not n.feature.startswith('BOS/EOS'):
            if n.feature.startswith(CONTENT_WORD_POS) and ',非自立,' not in n.feature:
                content_words += 1
            total += 1
        n = n.next
    return content_words / total
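A minimal usage sketch (the example sentence is only an illustration; the exact value depends on the installed MeCab dictionary):

print(compute_lexical_density('猫が鳴いている'))  # ratio of content words to all tokens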