Installing a library for text preprocessing
! pip install nltk
import nltk
from nltk.tokenize import word_tokenize
# A pretrained model of sentence structure, used by the tokenizer
nltk.download('punkt')
text = "Text mining, also referred to as text data mining, similar to text analytics, is the process of deriving high-quality information from text. It involves 'the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.'"
token = word_tokenize(text)
print(token)
['Text', 'mining', ',', 'also', 'referred', 'to', 'as', 'text', 'data', 'mining', ',', 'similar', 'to', 'text', 'analytics', ',', 'is', 'the', 'process', 'of', 'deriving', 'high-quality', 'information', 'from', 'text', '.', 'It', 'involves', "'the", 'discovery', 'by', 'computer', 'of', 'new', ',', 'previously', 'unknown', 'information', ',', 'by', 'automatically', 'extracting', 'information', 'from', 'different', 'written', 'resources', '.', "'"]
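The same punkt model also powers sentence-level splitting; a minimal sketch with sent_tokenize (assuming punkt was downloaded as above):
from nltk.tokenize import sent_tokenize

# Split the sample text into sentences; punkt handles abbreviations and quotes.
sentences = sent_tokenize(text)
for s in sentences:
    print(s)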
Part-of-speech tagging
from nltk.tag import pos_tag
# Download the tagger model that pos_tag uses
nltk.download('averaged_perceptron_tagger')
pos_tag(token)
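pos_tag returns (token, tag) pairs using the Penn Treebank tag set. A small sketch (not in the original) that reuses token from above and keeps only the nouns:
# Penn Treebank noun tags all start with 'NN'.
nouns = [word for word, tag in pos_tag(token) if tag.startswith('NN')]
print(nouns)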
Cleaning and normalization
text = 'I visited USA from England on 21-11-20'
normalized_text = text.replace("USA", "United States").replace("England", "United Kingdom").replace("-20", "-2020")
print(normalized_text)
I visited United States from United Kingdom on 21-11-2020
# Convert to lowercase
normalized_text = normalized_text.lower()
print(normalized_text)
i visited united states from united kingdom on 21-11-2020
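When there are more than a couple of substitutions, keeping them in a dictionary makes the normalization rules easier to maintain; a small sketch (the rule table is illustrative):
# Apply a table of normalization rules, then lowercase.
rules = {"USA": "United States", "England": "United Kingdom"}
normalized = text
for old, new in rules.items():
    normalized = normalized.replace(old, new)
print(normalized.lower())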
Stop-word removal
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Stop-word set for English text preprocessing
stop_words=stopwords.words('english')
print(stop_words[:20])
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']
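stopwords.words('english') is an ordinary Python list, so it can be extended with domain-specific words; a small sketch (the extra words are illustrative):
# Add custom stop words on top of NLTK's English list.
custom_stop_words = set(stop_words) | {'text', 'mining'}
print(len(stop_words), len(custom_stop_words))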
text = "Text mining, also referred to as text data mining, similar to text analytics, is the process of deriving high-quality information from text. It involves 'the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.'"
word_tokens = word_tokenize(text)
print(word_tokens)
result = []
for w in word_tokens:
    if w not in stop_words:
        result.append(w)
print(result)
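The same filtering reads more compactly as a list comprehension; lowercasing each token before the comparison (an assumption added here, not in the original loop) keeps capitalized words like 'It' from slipping past the lowercase stop-word list:
stop_set = set(stop_words)  # set lookup is faster than scanning the list
result = [w for w in word_tokens if w.lower() not in stop_set]
print(result)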
Spelling correction for English text
# Install the library for English spelling correction (Speller is a class inside the autocorrect package, so only autocorrect needs to be installed)
!pip install autocorrect
# Correct typos
from autocorrect import Speller
spell = Speller()
text = "Text mining, also referrred to as teext data minning, similar to text analytics, is the process of deriving high-quality information from text. It involves 'the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.'"
spell(text)
"Text mining, also referred to as text data winning, similar to text analytics, is the process of deriving high-quality information from text. It involves 'the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.'"
# Stemming and lemmatization
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()
stemmer.stem('cooking')
'cook'
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('cooking')
'cook'
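Porter and Lancaster often disagree, with Lancaster being the more aggressive stemmer; a quick side-by-side sketch (the word list is illustrative):
from nltk.stem import PorterStemmer, LancasterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
# Compare both stemmers on the same words.
for w in ['cooking', 'cookery', 'maximum', 'running']:
    print(w, porter.stem(w), lancaster.stem(w))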
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('products')
lemmatizer.lemmatize('production')
lemmatizer.lemmatize('cooking')
lemmatizer.lemmatize('cooking', pos = 'v')
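As the two 'cooking' calls show, WordNetLemmatizer only strips the '-ing' when told the word is a verb. A minimal sketch (not from the original post; the penn_to_wordnet helper name is illustrative) that feeds each token's pos_tag result into the lemmatizer by mapping Penn Treebank tags to WordNet POS codes:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def penn_to_wordnet(tag):
    # Map a Penn Treebank tag prefix to a WordNet POS code; default to noun.
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('R'):
        return 'r'
    return 'n'

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("He was cooking better meals every day")
print([lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in pos_tag(tokens)])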
# Regular Expression
import re
text = """001 Kim PROF002 Lee STUD003 Park STUD"""
print(text)
re.split(r'\s+', text)
re.findall(r'\d+', text)
re.findall(r'\D+', text)
re.findall(r'[A-Z]', text)
re.findall(r'[A-Z][a-z]+', text)
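A grouped pattern can pull the id, name, and role out of each record in a single pass; a small sketch (not in the original) assuming the three-field layout of the sample string:
# Each record: three digits, a capitalized name, then an upper-case role code.
re.findall(r'(\d{3})\s+([A-Z][a-z]+)\s+([A-Z]+)', text)
# [('001', 'Kim', 'PROF'), ('002', 'Lee', 'STUD'), ('003', 'Park', 'STUD')]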
# Text preprocessing with regular expressions
from nltk.tokenize import RegexpTokenizer
text = "Text mining, $ also referred to as text # data mining, \ similar to text analytics, is the process of deriving high-quality information from text. It involves 'the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.'"
# Split into words (drops special characters and punctuation)
tokenizer1 = RegexpTokenizer(r"[\w]+")
print(tokenizer1.tokenize(text))
['Text', 'mining', 'also', 'referred', 'to', 'as', 'text', 'data', 'mining', 'similar', 'to', 'text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'high', 'quality', 'information', 'from', 'text', 'It', 'involves', 'the', 'discovery', 'by', 'computer', 'of', 'new', 'previously', 'unknown', 'information', 'by', 'automatically', 'extracting', 'information', 'from', 'different', 'written', 'resources']
# Split on whitespace; gaps=True means the pattern matches the separators between tokens rather than the tokens themselves
tokenizer2 = RegexpTokenizer(r"\s+", gaps=True)
print(tokenizer2.tokenize(text))
['Text', 'mining,', '$', 'also', 'referred', 'to', 'as', 'text', '#', 'data', 'mining,', '\\', 'similar', 'to', 'text', 'analytics,', 'is', 'the', 'process', 'of', 'deriving', 'high-quality', 'information', 'from', 'text.', 'It', 'involves', "'the", 'discovery', 'by', 'computer', 'of', 'new,', 'previously', 'unknown', 'information,', 'by', 'automatically', 'extracting', 'information', 'from', 'different', 'written', "resources.'"]
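Putting the pieces together, a minimal end-to-end sketch: tokenize with the regex tokenizer, lowercase, drop stop words, then lemmatize (assuming the stopwords and wordnet resources were downloaded earlier; the preprocess helper name is just for illustration):
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def preprocess(raw_text):
    # Tokenize on word characters, lowercase, drop stop words, then lemmatize.
    tokenizer = RegexpTokenizer(r"[\w]+")
    lemmatizer = WordNetLemmatizer()
    stop_set = set(stopwords.words('english'))
    tokens = [t.lower() for t in tokenizer.tokenize(raw_text)]
    return [lemmatizer.lemmatize(t) for t in tokens if t not in stop_set]

print(preprocess(text))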