본문 바로가기
Data-driven Methodology/DS (Data Science)

[Data Science] 문서의 행렬 표현 (DTM and TF-IDF)

by goatlab 2022. 9. 29.
728x90
반응형
SMALL

CountVectorizer를 이용한 토큰화

 

import sklearn

print(sklearn.__version__)
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()

text = ['Text mining, also referred to as text data mining, similar to text analytics, is the process of deriving high-quality information from text.']

vector.fit_transform(text).toarray()
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 4, 1, 2]],
      dtype=int64)
# 단어별 인덱스 확인
print(vector.vocabulary_)
print(vector.fit_transform(text).toarray())
{'text': 15, 'mining': 9, 'also': 0, 'referred': 13, 'to': 17, 'as': 2, 'data': 3, 'similar': 14, 'analytics': 1, 'is': 8, 'the': 16, 'process': 11, 'of': 10, 'deriving': 4, 'high': 6, 'quality': 12, 'information': 7, 'from': 5}
[[1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 4 1 2]]
# CountVectorizer에서 제공하는 불용어 리스트 사용
vector1 = CountVectorizer(stop_words = "english")

print(vector1.fit_transform(text).toarray())
print(vector1.vocabulary_)
[[1 1 1 1 1 2 1 1 1 1 4]]
{'text': 10, 'mining': 5, 'referred': 8, 'data': 1, 'similar': 9, 'analytics': 0, 'process': 6, 'deriving': 2, 'high': 3, 'quality': 7, 'information': 4}

 

DTM and TF-IDF

 

import pandas as pd
from math import log

docs = ['We love horror films','We also like romance films','But my favorite genre is musical','So we want to watch West Side Story']

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()

print(vocab)
['But', 'Side', 'So', 'Story', 'We', 'West', 'also', 'favorite', 'films', 'genre', 'horror', 'is', 'like', 'love', 'musical', 'my', 'romance', 'to', 'want', 'watch', 'we']
N = len(docs)

# 특정 문서 d에서 특정 단어 w가 등장하는 횟수
def tf(w, d):
    return d.count(w)

# df: 특정단어 w가 등장하는 문서의 수
# idf: df(w)에 반비례하는 수
def idf(w):
    df = 0
    
    for doc in docs:
    # +=: 왼쪽 변수에 오른쪽 값을 더한 후, 결과를 왼쪽 변수에 저장
        df += w in doc
        return log(N/(df+1))
    
# 최종적으로 구하려는 TF-IDF 함수
def tfidf(w, d):
    return tf(w, d) * idf(w)
# DTM
result = []

# 각 문서 하나하나를 호출한 후 단어별로 카운트
for i in range(N):
    result.append([])
    
    d = docs[i]
    
    for j in range(len(vocab)):
        w = vocab[j]
        
        result[-1].append(tf(w, d))
        
tf_ = pd.DataFrame(result, columns = vocab)
tf_

# calculating IDF values
result = []

for j in range(len(vocab)):
    result.append([])
    
    t = vocab[j]
    
    result[-1].append(idf(t))
    
idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_

# TF-IDF matrix
result = []

for i in range(N):
    result.append([])
    
    d = docs[i]
    
    for j in range(len(vocab)):
        w = vocab[j]
        
        result[-1].append(tfidf(w, d))
        
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()

print(vector.fit_transform(docs).toarray())
print(vector.vocabulary_)
[[0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0]
 [1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0]
 [0 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]]
{'we': 18, 'love': 8, 'horror': 5, 'films': 3, 'also': 0, 'like': 7, 'romance': 11, 'but': 1, 'my': 10, 'favorite': 2, 'genre': 4, 'is': 6, 'musical': 9, 'so': 13, 'want': 16, 'to': 15, 'watch': 17, 'west': 19, 'side': 12, 'story': 14}
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer().fit(docs)

print(tfidf.vocabulary_)
print(tfidf.transform(docs).toarray())
{'we': 18, 'love': 8, 'horror': 5, 'films': 3, 'also': 0, 'like': 7, 'romance': 11, 'but': 1, 'my': 10, 'favorite': 2, 'genre': 4, 'is': 6, 'musical': 9, 'so': 13, 'want': 16, 'to': 15, 'watch': 17, 'west': 19, 'side': 12, 'story': 14}
[[0.         0.         0.         0.4530051  0.         0.57457953
  0.         0.         0.57457953 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.36674667 0.        ]
 [0.49819711 0.         0.         0.39278432 0.         0.
  0.         0.49819711 0.         0.         0.         0.49819711
  0.         0.         0.         0.         0.         0.
  0.31799276 0.        ]
 [0.         0.40824829 0.40824829 0.         0.40824829 0.
  0.40824829 0.         0.         0.40824829 0.40824829 0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.36742339 0.36742339 0.36742339 0.36742339 0.36742339 0.36742339
from afinn import Afinn 

afn = Afinn()

text = 'I love horror films.'

afn.score(text)
3.0
text = 'I like horror films.'

afn.score(text)
2.0
afn = Afinn(emoticons = True)

text = 'I love horror films. :('

afn.score(text)
1.0
afn.score(':(')
-2.0
from textblob import TextBlob
# polarity: from -1 (negative) to 1 (positive)
# subjectivity: from 0 (very objective) to 1 (very subjective)

TextBlob("I love horror films").sentiment
Sentiment(polarity=0.5, subjectivity=0.6)
TextBlob("I like horror films").sentiment
Sentiment(polarity=0.0, subjectivity=0.0)
TextBlob("So I want to watch West Side Story").sentiment
Sentiment(polarity=0.0, subjectivity=0.0)
728x90
반응형
LIST