from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus, asking the loader to
# strip message headers and footers.
# NOTE(review): 'quates' is not an option that `remove` recognises -- the
# documented value is 'quotes' -- so quoted reply text is presumably left in
# the documents. The literal is kept as-is so the accuracy figures recorded
# further down stay reproducible; confirm the intent and re-run before
# correcting the spelling.
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quates'))
X_train = train_news.data    # raw document texts
y_train = train_news.target  # integer newsgroup labels

# Load the held-out test split with exactly the same options.
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quates'))
X_test = test_news.data
y_test = test_news.target
텍스트 분석 - 20 뉴스그룹 분류
머신 러닝
전처리
학습
Count Vector
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: fit the vectorizer on the training texts only, then
# reuse that fitted vectorizer to transform the test texts.
cnt_vect = CountVectorizer()
X_train_cnt_vect = cnt_vect.fit_transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Silence all warnings for the rest of the session (process-wide filter).
warnings.filterwarnings('ignore')

# Train logistic regression on the count-vector features and print the
# test-set accuracy to three decimals.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print(f'{accuracy_score(y_test, pred):.3f} ')
0.731
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: fit the vectorizer on the training texts only, then reuse
# that fitted vectorizer to transform the test texts.
tfidf_vect = TfidfVectorizer()
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)
# Train a fresh logistic regression on the TF-IDF features and print the
# test-set accuracy to three decimals. `lr_clf` and `pred` are rebound here,
# replacing any earlier values held under those names.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print(f'{accuracy_score(y_test, pred):.3f} ')
0.778