-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtfidfvectorize.py
More file actions
35 lines (27 loc) · 1.37 KB
/
tfidfvectorize.py
File metadata and controls
35 lines (27 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# @title tfidf作成(ファジー)
# Tf-idf calculation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity # cosine calculation
# from sklearn.metrics.pairwise import linear_kernel # kernel density estimation quick ver of cosine
import pandas as pd
import streamlit as st
#from wakati import wakati_proc
from wakati import sentencep
# 分析対象の列指定
# strt_rdy = df['f1_']
# strt_rdy = dfs_20["IPC"].str.replace("\(.*?\)| +","").str.replace(","," ").str.replace("/","_")
@st.cache_data
def get_tfidfword(dataframe, colname):
#strt_rdy = #wakati_proc(dataframe[colname].astype(str))
strt_rdy = sentencep(dataframe,colname)["txt_enc"].tolist()
vectorizer = TfidfVectorizer(
stop_words='english',
# token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b'
# token_pattern='^[a-z][0-9]{2}[a-z].*/'
ngram_range=(1, 1), min_df=2, lowercase=True)
# fit the vectorizer to the dataframe and generate 1038380 texts tf-idf matrix table
tfidf_matrix = vectorizer.fit_transform(strt_rdy)
tfidf_matrix.mean(axis=0) # axis=1で行方向(column)
tfidf_df = pd.concat([pd.DataFrame(vectorizer.get_feature_names_out(), columns=["word"]), pd.DataFrame(
tfidf_matrix.mean(axis=0).T)], axis=1).sort_values(by=0, ascending=False).head(100)
return tfidf_df