### dependencies ###
import pandas as pd
import numpy as np
import random
import re
import os
import datetime
### tokenizers ###
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import wordpunct_tokenize
from nltk import word_tokenize
### DTM makers & weighting ###
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
### dimension reduction techniques ###
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding as LLE
import umap
### model implementations & selection ###
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
### sentiment analysis ###
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
### clustering ###
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN
### plotting ###
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
############################################################################# Full Read #####################################################################################################
### will use for later analysis ####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
reviews = pd.read_csv("Amazon_reviews.csv", names=["rating", "title", "review"])
#################################################################################### partial sample Read ####################################################################################
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
n = 3000000 # df size
s = 10000 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])
del(skip,n)
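### reusable sampling helper (a sketch, not part of the original workflow) -- wraps the
### skiprows trick above so the same random-sample read can be repeated later in the notebook;
### the name sample_csv is hypothetical
def sample_csv(path, n_rows, n_sample, **read_kwargs):
    skip_rows = sorted(random.sample(range(1, n_rows), n_rows - n_sample))  # rows to skip
    return pd.read_csv(path, skiprows=skip_rows, **read_kwargs)
# example: reviews = sample_csv("Amazon_reviews.csv", 3000000, 10000, names=["rating", "title", "review"])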
### text blob
TextBlob(reviews.review[1]).sentiment
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0:
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10  # 10,000-review sample, so every 1,000 reviews is 10%
sent_blob
### vader analysis #####
now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0
for i, review in enumerate(reviews.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0:
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10
sent_vader
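### a faster, equivalent sketch (assumption: not how the original loop was run) --
### score every review in one pass and build the frame once instead of row-by-row .loc writes
sent_vader_alt = pd.DataFrame([analyser.polarity_scores(r) for r in reviews.review])
sent_vader_alt = sent_vader_alt.rename(columns={'neg': 'vader_neg', 'neu': 'vader_neu',
                                                'pos': 'vader_pos', 'compound': 'vader_compound'})
# sent_vader_alt.equals(sent_vader.astype(float))  # should agree with the loop's output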
### saving sentiment #####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_vader.to_pickle('sent_vader.pkl')
sent_blob.to_pickle('sent_blob.pkl')
master=pd.concat([sent_vader, sent_blob], axis=1)
master.to_pickle('master_sent.pkl')
master
####################################################### graphs for sentiment ###############################################################
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_master=pd.read_pickle('master_sent.pkl')
sent_master
ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
sent_master['rating']=ratings
sent_master.vader_neg=sent_master.vader_neg.astype(float)
sent_master.vader_neu=sent_master.vader_neu.astype(float)
sent_master.vader_pos=sent_master.vader_pos.astype(float)
sent_master.vader_compound=sent_master.vader_compound.astype(float)
sent_master=sent_master.sort_values('rating')
grp_sent=sent_master.groupby('rating', as_index=False).mean()
# grp_sent
# sent_master
# sns.barplot(y=grp_sent.vader_neg, x=grp_sent.ratting)
# plt.show()
# sns.barplot(y=grp_sent.vader_pos, x=grp_sent.ratting)
# plt.show()
# sns.barplot(y=grp_sent.text_blob_subjectivity, x=grp_sent.ratting)
# plt.show()
fig = plt.figure(figsize=(50,50))
plt.style.use(['fivethirtyeight'])
ax1=plt.subplot2grid((4,4), (0,0), rowspan=1, colspan=1)
ax2=plt.subplot2grid((4,4), (0,1), rowspan=1, colspan=1)
ax3=plt.subplot2grid((4,4), (1,0), rowspan=2, colspan=2)
sns.barplot(y=grp_sent.Text_blob_polarity, x=grp_sent.rating, ax=ax1)
sns.barplot(y=grp_sent.vader_compound, x=grp_sent.rating, ax=ax2)
x=sent_master.rating.astype(str)
sns.scatterplot(y=sent_master.Text_blob_polarity, x= sent_master.vader_compound, hue=sent_master.rating.astype(str)+'#',
palette='twilight',ax=ax3)
ax3.set_title("\n\nText Blob Polarity Vs. Vader Compound Score", weight='bold',size=50)
ax3.set_xlabel("Vader Compound Score",size=35, weight='bold')
ax3.set_ylabel("TextBlob Polarity Score",size=35, weight='bold')
ax3.legend(loc=2, markerscale=6,fancybox=True, shadow=True,title='Star Rating', prop={'size': 30})
ax1.set_title("Text Blob Polarity Score", weight='bold',size=40)
ax1.set_ylabel("Text Blob Polarity",size=25, weight='bold')
ax1.set_xlabel("Star Rating", size=25, weight='bold')
ax2.set_title("Vader Polarity Score", weight='bold',size=40)
ax2.set_ylabel("Vader Compound Score",size=25, weight='bold')
ax2.set_xlabel("Star Rating",size=25, weight='bold')
plt.show()
fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\sentiment.png')
# sent_master
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from nltk import wordpunct_tokenize
### Stop Words ####
en_stop = stopwords.words('english')
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# Additions to the stop word list
en_stop.extend([".","-","(", ")","/", r",+", "’", "”","“", "\n",r"\\)","!","?",r"\\(",r"\\'",r'\\"'])
############################################################################ Document Term Matrices ###################################################################
stem_function= PorterStemmer
port=PorterStemmer
## stemmer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=stem_function().stem(token)
        stems.append(stem)
    return stems
## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to strip punctuation
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
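### quick sanity check of the tokenizer (illustrative only -- exact stems depend on NLTK's PorterStemmer)
# tokenizer("This product is great! Loved it.")  # -> roughly ['thi', 'product', 'is', 'great', 'love', 'it']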
word_list=reviews.review.to_list()
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
df2.head()
##################################################################### Environment Cleanup ###################################################################################################
#del(en_stop,string, text, word_list,dtm)
### saving DF2
#df2.to_pickle("freq_df.pkl")
# del(en_stop,string, text, word_list)
########################################################################### Term Frequency L1 & L2 ##############################################################################
from sklearn.feature_extraction.text import TfidfTransformer
tfdif_l1 = TfidfTransformer(norm='l1', use_idf=True)
tfdif_l2 = TfidfTransformer(norm='l2', use_idf=True)
dtm_l1 = tfdif_l1.fit_transform(df2)
dtm_l2 = tfdif_l2.fit_transform(df2)
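### sanity check (a sketch, not in the original run): TfidfTransformer's norm='l1' makes each
### non-empty row's absolute values sum to 1, while norm='l2' makes its squared values sum to 1
# np.abs(dtm_l1.toarray()).sum(axis=1)   # ~1.0 per non-empty review
# (dtm_l2.toarray() ** 2).sum(axis=1)    # ~1.0 per non-empty review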
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
### l2 DTM #####
dtm_l2=pd.DataFrame(dtm_l2.toarray(), columns=vect.get_feature_names())
dtm_l2=pd.concat([dtm_l2, master], axis=1)
# dtm_l2.shape()
dtm_l2.to_pickle("dtm_l2.pkl") ## using pickle format due to size
del(dtm_l2)
# ### l1 DTM #####
dtm_l1=pd.DataFrame(dtm_l1.toarray(), columns=vect.get_feature_names())
dtm_l1=pd.concat([dtm_l1, master], axis=1)
dtm_l1.to_pickle("dtm_l1.pkl") ## using pickle format due to size
# dtm_l1.shape()
del(dtm_l1)
df2=pd.concat([df2, master], axis=1)
df2.to_pickle("freq_df.pkl")
# df2.shape()
reviews.to_pickle("reviews.pkl")
############################################################################ T-SNE Tuning ########################################################################
### mapping TSNE ####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
df=pd.read_pickle(directory_main[0])
### approximating TSNE with a sample (uncomment to subsample)
# rows=random.sample(range(1, df.shape[0]), 1000)
# df=df.iloc[rows,:]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
# y=y[rows]  # only needed when subsampling above
x=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)
sns.scatterplot(x=x[:,0], y=x[:,1],hue=y.astype(str)+'_#')
############################################################################ train test split & TSNE Fit #################################################
## this section creates the train and test directories & fits T-SNE #####
now=datetime.datetime.now()
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
name=["dtm_l1","dtm_l2", "dtm_freq"]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
for i, file in enumerate(directory_main):
    ### reading in file #####
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
    df=pd.read_pickle(file)
    ### getting TSNE (sklearn's TSNE has no transform(), so it is fit on the full DTM before the split) #####
    sne_df=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)
    sne_df=pd.DataFrame(sne_df, columns=["TSNE_1","TSNE_2"])
    TSNE_file=name[i]+"_TSNE"+".pkl"
    sne_df.to_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\{}".format(TSNE_file))
    df['TSNE_1']=sne_df.iloc[:,0]
    df['TSNE_2']=sne_df.iloc[:,1]
    print("TSNE calculation done for ", name[i])
    print("total time ", datetime.datetime.now()-now)
    print("Current Time: ", datetime.datetime.now())
    #### setting up directories for Train Test splits #####
    x_train, x_test, y_train, y_test = train_test_split(df.values, y, test_size=0.05, random_state=42)
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train"
    pd.DataFrame(x_train, columns=df.columns).to_pickle(name[i]+"_x_train"+".pkl")
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test"
    pd.DataFrame(x_test, columns=df.columns).to_pickle(name[i]+"_x_test"+".pkl")
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train"
    pd.DataFrame(y_train).to_pickle(name[i]+"_y_train"+".pkl")
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test"
    pd.DataFrame(y_test).to_pickle(name[i]+"_y_test"+".pkl")
################################################################################ TSNE Graph ####################################################################################
tsne=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\dtm_l1_TSNE.pkl')
ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
tsne['Ratings']=ratings
tsne=tsne.sort_values('Ratings')
tsne['Ratings']=tsne['Ratings'].astype(str)+'#'
ones=tsne[tsne.Ratings=='1#']
fives=tsne[tsne.Ratings=='5#']
fig, axe = plt.subplots(figsize=(20,20))
sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.3 )
sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.3)
sns.scatterplot(x=tsne.iloc[:,0], y=tsne.iloc[:,1], hue=tsne['Ratings'], palette='twilight',
x_jitter=10,y_jitter=10,alpha=.8,style=tsne['Ratings'], ax=axe)
axe.set_title('T-SNE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('T-SNE Component Two', size=30)
axe.set_xlabel('T-SNE Component One', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})
fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\TSNE.png')
### Clean Up ####
# del(df, directory_main, file, i, name,names, y, x_train, x_test, y_train, y_test)
This section is not in the most logical order, so a brief explanation: the first dimension reductions were fit with default settings. After running the first batch of models and narrowing down the DTMs, the reductions were re-worked and more finely tuned.
########################################################################### First Dimension Reductions #############################################################################
import warnings
warnings.simplefilter("ignore")
now=datetime.datetime.now()
#### dimension Reduction ####
dimensions=3
n_neighbors=15
#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=15, n_components=3)
lle_fun=LLE(n_neighbors=5,n_components=3,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1)
pca_fun=PCA(n_components=.8)  # float n_components keeps enough components to explain 80% of the variance
### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=["dtm_freq", "dtm_l1", "dtm_l2"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")
for i, file in enumerate(x_train_directory):
    ### take out [0:50] to run whole dataset ####
    drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
    x_train=pd.read_pickle(path+"x_train\\"+file)
    sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_train=x_train[['TSNE_1','TSNE_2']]
    x_train=x_train[[col for col in x_train.columns if col not in drops]]
    # display(x_train)
    x_test=pd.read_pickle(path+"x_test\\"+x_test_directory[i])
    sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_test=x_test[['TSNE_1','TSNE_2']]
    x_test=x_test[[col for col in x_test.columns if col not in drops]]
    # display(x_test)
    ### Sparse PCA ####
    Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
    col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
    sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
    sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
    print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
    ### PCA #####
    PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
    col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
    PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
    PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
    print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
    ### UMAP #####
    u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
    col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
    umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
    umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
    print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
    ### LLE ####
    lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
    col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
    lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
    lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
    print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
    ### Saving\creating DF's ###
    train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
    test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)
    test_df.to_pickle(path+"dimension_rd_x_test\\"+names[i]+"_dim_rd_test.pkl")
    train_df.to_pickle(path+"dimension_rd_x_train\\"+names[i]+"_dim_rd_train.pkl")
    print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
####################################################################### L1 Dimension Reduction ##############################################################################
import warnings
warnings.simplefilter("ignore")
now=datetime.datetime.now()
#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)
pca_fun=PCA(n_components=.8)
### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=[ "dtm_l1"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")[1]
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")[1]
file=x_train_directory
i=0
################################ reading DataFrame ###########################################################
### take out [0:50] to run whole dataset ####
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
x_train=pd.read_pickle(path+"x_train\\"+file)
sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_train=x_train[['TSNE_1','TSNE_2']]
x_train=x_train[[col for col in x_train.columns if col not in drops]]
# display(x_train)
x_test=pd.read_pickle(path+"x_test\\"+x_test_directory)
sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_test=x_test[['TSNE_1','TSNE_2']]
x_test=x_test[[col for col in x_test.columns if col not in drops]]
# display(x_test)
#################################### Sparse #################################
Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
##################################### PCA ########################################
PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
##################################### U Map #######################################
u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
#################################### lle ############################################
lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker
############################## Saving\creating DF's ####################################################
train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)
test_df.to_pickle(path+"dimension_rd_x_test_2\\"+names[i]+"_dim_rd_test_2.pkl")
train_df.to_pickle(path+"dimension_rd_x_train_2\\"+names[i]+"_dim_rd_train_2.pkl")
print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
####################################################### Saving fits ####################################################
import pickle
dimension_tec=[umap_fun, lle_fun, sparce_fun, pca_fun]
dimension_name=['umap_fun.pickle', 'lle_fun.pickle', 'sparce_fun.pickle', 'PCA_fun.pickle']
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects"
for i, fit in enumerate(dimension_tec):
    pickle_out = open(dimension_name[i],"wb")
    pickle.dump(fit, pickle_out)
    pickle_out.close()
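### quick check that the saved fits load back (a sketch; assumes the %cd above is still in effect)
for fit_name in dimension_name:
    with open(fit_name, "rb") as pickle_in:
        print(fit_name, "->", type(pickle.load(pickle_in)).__name__)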
########################################################################### UMAP Graph ##############################################################################################
#### reading in files
y_train_l1=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')
#### used for tuning UMAP, which was later applied to the final dimensions
umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
u_map_transform_train=umap_fun.fit_transform(x_train.iloc[:,:-8])
raw=u_map_transform_train
u_map_transform_train=pd.DataFrame(u_map_transform_train, columns=['UMAP_1', 'UMAP_2'])
u_map_transform_train['rating']=y_train_l1.rating
u_map_transform_train=u_map_transform_train.sort_values('rating')
u_map_transform_train.rating=u_map_transform_train.rating.astype(str)+'#'
u_map_transform_train=u_map_transform_train[u_map_transform_train.rating!='nan#']
ones=u_map_transform_train[u_map_transform_train.rating=='1#']
fives=u_map_transform_train[u_map_transform_train.rating=='5#']
########################################### plotting UMAP ##############################################################
fig, axe = plt.subplots(figsize=(20,20))
### ones and Fives Densities
# sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )
# sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)
### Total Densities
sns.kdeplot(data=u_map_transform_train.iloc[:,0],data2=u_map_transform_train.iloc[:,1], cmap="twilight", ax=axe, shade=True, alpha=.4 )
### individual Points
sns.scatterplot(u_map_transform_train.UMAP_1, u_map_transform_train.UMAP_2, hue=u_map_transform_train.rating ,
style=u_map_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('UMAP Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('UMAP Component Two', size=30)
axe.set_xlabel('UMAP Component One', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})
#plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\UMAP.png')
####################################################################### LLE Graph ##############################################################################################
lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
lle_transform_train=lle_fun.fit_transform(x_train.iloc[:3000,:-8]) # fitting to the first 3,000 training rows for speed
lle_transform_train=pd.DataFrame(lle_transform_train, columns=['LLE_1', 'LLE_2'])
lle_transform_train['rating']=y_train_l1.rating
lle_transform_train=lle_transform_train.sort_values('rating')
lle_transform_train.rating=lle_transform_train.rating.astype(str)+'#'
lle_transform_train=lle_transform_train[lle_transform_train.rating!='nan#']
ones=lle_transform_train[lle_transform_train.rating=='1#']
fives=lle_transform_train[lle_transform_train.rating=='5#']
##################################################### Plotting LLE ############################################################################
fig, axe = plt.subplots(figsize=(20,20))
### ones density ###
#sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )
### fives density #####
#sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)
### total Density ####
sns.kdeplot(data=lle_transform_train.iloc[:,1],data2=lle_transform_train.iloc[:,0],cmap="twilight", ax=axe, shade=True, alpha=.4 )
sns.scatterplot(lle_transform_train.LLE_2, lle_transform_train.LLE_1, hue=lle_transform_train.rating ,
style=lle_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('LLE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('LLE Component One', size=30)
axe.set_xlabel('LLE Component Two', size=30)
axe.set_ylim(-.02,.03)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})
# axe.set_xlim(-.05,.08)
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\LLE.png')
plt.show()
######################################################################### first Paramgrid ####################################################################################
knn_params={
    'p':[2],
    'n_neighbors':[5,10,20],
}
random_params={
    'n_estimators':[100,500],
    'min_samples_split':[5,10],
    'min_samples_leaf':[5,10],
}
gradient_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'min_samples_split':[5],
    'min_samples_leaf':[5],
    'random_state':[4],
}
xg_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'max_depth':[5,7,9],
}
master_params=[knn_params, random_params, gradient_params, xg_params]
models=[KNeighborsClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier]
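### rough size of each grid search (a sketch, not in the original): the product of the
### parameter-list lengths times the 5 CV folds gives the number of model fits per DTM
for params, model in zip(master_params, models):
    n_combos = int(np.prod([len(v) for v in params.values()]))
    print(model.__name__, ":", n_combos, "combinations,", n_combos * 5, "fits")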
######################################################################### first initial GridSearch ##################################################################################
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train")
df_names=["freq", "l1","l2"]
master_1=pd.DataFrame()
##### selecting only 100 PCA's (issue with time) ##########################
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'TSNE_1','TSNE_2'])
for i, x_train in enumerate(x_train_files):
    #### reading files #######
    x_train=pd.read_pickle(path+"dimension_rd_x_train\\{}".format(x_train))
    y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[i])).rating.values
    x_train=x_train.loc[:,cols].values
    for j, model in enumerate(models):
        #### fitting models ####
        clf=GridSearchCV(model(), master_params[j], n_jobs= -1 ,cv=5, verbose=10)
        new_models=clf.fit(x_train, y_train)
        df=pd.DataFrame(new_models.cv_results_)
        df['df']=df_names[i]
        df['cols']='All'
        df['model']=model.__name__
        master_1=pd.concat([master_1,df], axis=0)
        print('Finished with df {} model {}'.format(df_names[i],model.__name__))
        print("Total time:", datetime.datetime.now()-now)
print("\n\n#################################################### Fin #############################################################")
######################################################################### top Model Graphs Round One #########################################################################################
rd_one=pd.read_csv('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\master_all_cols.csv')
display(rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].max())
df_3=rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].agg(['min','max','mean','std'])
df_3=df_3.reset_index()
sns.set_style('darkgrid')
df_3=df_3.sort_values('max', ascending=False)
fig, ax= plt.subplots(figsize=(11,5))
sns.barplot(y=df_3['model'], x=df_3['max'], hue=df_3.df, palette='mako')
ax.set_title('Top Model by Algorithm and DTM')
ax.set_ylabel('')
ax.set_xlabel('Accuracy')
#ax.annotate('afg',(34,45))
ax.legend(loc=2, markerscale=5,fancybox=True, shadow=True, prop={'size': 30})
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\top_models.png')
#ax.table(df_3)
###################################################################### NEW Feature Fit + Various Parameters ########################################################################
### this cell dynamically changed as new features and parameters were tested for feature importance
xg_params={
'learning_rate':[.1,.001,.01],
'n_estimators':[100,500],
'subsample':[.6,.8,1],
'max_depth':[5,9,15],
}
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_3=pd.DataFrame()
####### feature importance selection ################
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:200]
# cols=["PCA_{}".format(x) for x in range(100)]
# cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
# 'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']
##################### Reading in File ############################################
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols].values
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
#### fitting models ####
clf=GridSearchCV(XGBClassifier(tree_method='gpu_hist'), xg_params ,cv=3, verbose=10)
new_models=clf.fit(x_train, y_train)
df=pd.DataFrame(new_models.cv_results_)
df['df']=df_names
df['cols']='top 200 ft'
df['model']=XGBClassifier.__name__
master_3=pd.concat([master_3,df], axis=0)
print('Finished with df {} model {}'.format(df_names, XGBClassifier.__name__))
print("Total time:", datetime.datetime.now()-now)
print("\n\n#################################################### Fin #############################################################")
####################################################################### Creating Feature Importance DataFrame ################################################################################
###
#clf.estimator.fit(x_train, y_train)
# clf_1=XGBClassifier(**clf.best_params_)
# clf_1.fit(x_train, y_train)
# clf_1.feature_importances_
# plot_importance(clf_1,max_num_features=30 )
# x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
# x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]]
# ft_import_df=pd.DataFrame()
# ft_import_df['ft_im']=clf_1.feature_importances_
# ft_import_df['cols']=x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]].columns
# ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
#ft_import_df.to_pickle('ft_imp_df.pkl')
# fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
# fin_100.mean_test_score.max()
####################################################################### Plotting Feature Importance ####################################################################################
sns.set_style("dark")
ft_import_df_1=ft_import_df.sort_values('ft_im', ascending=False)
fig, ax= plt.subplots(figsize=(20,10))
sns.barplot(ft_import_df_1.ft_im[0:16],ft_import_df_1.cols[0:16],ax=ax, palette='twilight')
ax.set_title("XGBoost's Top 16 Most Important Features\n", size=30,fontdict= {'family':'MS Reference Sans Serif','weight':'bold'}, weight='bold' )
ax.set_ylabel('')
ax.set_xlabel('F-Score', weight='bold', size=15)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\XG_ft_importance_fig.png')
fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
fin_100.mean_test_score.max()
########################################################################## Single Fit Algorithm Cell #####################################################################################
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train_files=os.listdir(path+"y_train")
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1]))
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
params={'learning_rate': 0.1,
'max_depth': 5,
'n_estimators': 100,
'subsample': 0.6,
'tree_method': 'gpu_hist'}
col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']
i=0
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')
##### Selecting top 100 ft ##########################
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols]
y_train=pd.read_pickle(path+"y_train\\dtm_l1_y_train.pkl").rating.values
#### fitting models ####
for col in col_list_fix:
    x_train[col]=x_train[col].astype(float)  # x_test is converted later, once it is loaded in the test-fit cell
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
clf=XGBClassifier(**params)
clf.fit(x_train,y_train)
scores = cross_validate(clf, x_train , y_train, cv=5)
scores
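### summarizing the 5-fold scores (a sketch): cross_validate returns a dict of arrays keyed by metric
print("mean CV accuracy:", round(scores["test_score"].mean(), 3))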
############################################################### Test fit For Y Variable ###################################################################
x_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test\\dtm_l1_dim_rd_test.pkl")
y_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl")
y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl")
x_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_train\\dtm_l1_dim_rd_train.pkl")
col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']
for col in col_list_fix:
    x_test[col]=x_test[col].astype(float)
    x_train[col]=x_train[col].astype(float)
y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\\y_train\\dtm_l1_y_train.pkl")
# display(x_test.head())
# x_train.head()
# clf.fit(x_train[cols].values,y_train)
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
pred=clf.predict(x_test[cols])
pred_1=clf.predict(x_train[cols])
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print("\n\n\n test df results")
confusion_matrix(y_test,pred_1)
print(classification_report(y_train, pred_1))
print("\n\n\n test df results")
confusion_matrix(y_test,pred)
print(classification_report(y_test, pred))
####################################################### saving model ####################################################
import pickle
pickle_out = open("clf.pickle","wb")
pickle.dump(clf, pickle_out)
pickle_out.close()
##################################################################### Creating a Data Pipeline to Test #################################################################################
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
from sklearn.feature_extraction.text import TfidfTransformer
n = 3000000 # df size
s = 800 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews_test = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])
now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0
###### vader
for i, review in enumerate(reviews_test.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0:
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10
### text blob
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews_test.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0:
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10
master=pd.concat([sent_vader, sent_blob], axis=1)
stem_function= PorterStemmer
port=PorterStemmer
## stemmer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=stem_function().stem(token)
        stems.append(stem)
    return stems
## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to strip punctuation
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
word_list=reviews_test.review.to_list()
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
tfdif_l1 = TfidfTransformer(norm='l1', use_idf=True)
dtm_l1 = tfdif_l1.fit_transform(df2)  # note: computed but not used below -- the fitted reducers are applied to the raw count DTM
df2=pd.concat([df2, master], axis=1)
df2
sent_test=df2[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']
dimension_tec=[umap_fun, lle_fun, sparce_fun,pca_fun]
cols=[col for col in df2.columns if col not in drops]
dtm=df2[cols]
umap_ft=umap_fun.transform(dtm)
col_name=["UMAP_{}".format(x) for x in range(umap_ft.shape[1])] # creating col names
umap_df_train=pd.DataFrame(umap_ft, columns=col_name) # Sparse train df
lle_ft=lle_fun.transform(dtm)
col_name=["LLE_{}".format(x) for x in range(lle_ft.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_ft, columns=col_name) # Sparse train df
sparce_ft=sparce_fun.transform(dtm)
col_name=["Sparse_{}".format(x) for x in range(sparce_ft.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(sparce_ft, columns=col_name) # Sparse train df
pca_ft=pca_fun.transform(dtm)
col_name=["PCA_{}".format(x) for x in range(pca_ft.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(pca_ft, columns=col_name) # Sparse train df
train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,master,lle_df_train], axis=1)
chng=['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
for col in chng:
    train_df[col]=train_df[col].astype(float)
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
pred=clf.predict(train_df[cols])
# pred=cross_val_predict(clf, X, y)
print(classification_report(reviews_test.rating, pred))
i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_2=pd.DataFrame()
##### selecting only 100 PCA's (issue with time) ##########################
# cols=["PCA_{}".format(x) for x in range(100)]
#cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
#'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
#x_train=x_train.loc[:,cols]
x_train[[col for col in x_train.columns if col not in ['TSNE_1', 'TSNE_2']]]
### feature engineering: most Frequent Words ####
# x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')
# y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# x_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test\\dtm_l1_x_test.pkl')
# y_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl')
# ###### Most Frequent Words #########
# top=500
# df=x_train
# df['rating']=y_train.rating
# # l1_dtm['rating']=rev.rating
# # display(l1_dtm.melt(var_name='rating'))
# top_1=df.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
# top_2=df.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
# top_3=df.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
# top_4=df.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
# top_5=df.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index
# one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3]
# two_toks=[col for col in top_2 if col not in top_5 | top_4 ]
# three_toks=[col for col in top_3 if col not in top_5 | top_1]
# four_toks=[col for col in top_4 if col not in top_2|top_1]
# five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]
# x=['familiar' for x in range(len(one_toks))]
# g=0
# freq_map={**{i:'Low star' for i in one_toks},**{i:'Med star' for i in three_toks},**{i:'High star' for i in five_toks}}
for dex in df.T.iloc[:-9,0].index:
    df.T.loc[dex,]
###### feature Engineering: Binning polarity
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test_2\\dtm_l1_dim_rd_test_2.pkl')
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
pd.cut(x_train.vader_compound, bins=5)  # the original line was left unfinished; 5 equal-width bins is an assumed placeholder
#################################################################################### HDBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
### reducing cols ####
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])
### getting file directories
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')
sent_dir=os.listdir(path+'sentiment')
now=datetime.datetime.now()
hdbscan_fun=HDBSCAN(min_cluster_size=500, min_samples=50, core_dist_n_jobs=-1,metric='manhattan')
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])
cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']
sent=['vader_compound','Text_blob_polarity']  # the two columns actually used for the fit below
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
#hdbscan_fun.fit(df[cols].values)
hdbscan_fun.fit(df[sent])
cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])
# cluster_df.to_pickle(names[i]+'_cluster.pkl')
# print("############### DF {} at {}##########".format(file,now))
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
df.iloc[:,-13:]
sent=['vader_compound','Text_blob_polarity']
cluster_df['rating']=y_train.rating
pd.crosstab(cluster_df.HDBSCAN.astype(str),cluster_df.rating.astype(str))
######################################################################### HDBSCAN Crosstab #######################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.HDBSCAN=cluster_df.HDBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['HDBSCAN'],margins=True, margins_name='total')
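### follow-up sketch (not in the original): share of the most common star rating within each
### HDBSCAN cluster, as a rough read of how "pure" the clusters are
ct = pd.crosstab(cluster_df['HDBSCAN'], cluster_df['rating'])
print((ct.max(axis=1) / ct.sum(axis=1)).round(2))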
#################################################################################### DBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
db_fun= DBSCAN(eps=.5,min_samples=500,p=2, n_jobs=-1,leaf_size=100)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
# cols=['TSNE_1','TSNE_2']
# cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
db_fun.fit(df[sent].values)
#db_fun.fit(df.values)
cluster=db_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['DBSCAN'])
# print("############### DF {} at {}##########".format(file,now))
################################################################ DBSCAN Cross Tab ########################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.DBSCAN=cluster_df.DBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['DBSCAN'],margins=True, margins_name='total')
#################################################################################### K-MEANS ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
km_fun= KMeans(n_clusters=5, n_jobs=-1)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
#cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
#cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
cols=['TSNE_1','TSNE_2']
#cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
km_fun.fit(df[sent].values)
#km_fun.fit(df.values)
cluster=km_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['K_MEANS'])
# print("############### DF {} at {}##########".format(file,now))
######################################################################### K-Means Cross Tab ##################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.K_MEANS=cluster_df.K_MEANS.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['K_MEANS'],margins=True, margins_name='total')
################# Clustering on tokens #################
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")
rev=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl")
####################################################################### Frequency of word by Ratings ##################################################
# l1_dtm=l1_dtm.iloc[:,:-6]
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")
top=300
l1_dtm['rating']=rev.rating
# display(l1_dtm.melt(var_name='rating'))
top_1=l1_dtm.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
top_2=l1_dtm.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
top_3=l1_dtm.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
top_4=l1_dtm.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
top_5=l1_dtm.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index
one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3 | top_2 ]
two_toks=[col for col in top_2 if col not in top_5 | top_4]
three_toks=[col for col in top_3 if col not in top_5 | top_1]
four_toks=[col for col in top_4 if col not in top_2|top_1]
five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]
################################################################################# Clustering Token Attempt ########################################################################################
trn=l1_dtm.pivot(columns='rating').T.reset_index().fillna(0)
tokens=trn.level_0  # token names; trn.rating holds the star-rating level of each row
trn
umap_fun=umap.UMAP(n_neighbors=50, n_components=10, metric='cosine')
rd_trn=umap_fun.fit_transform(trn.iloc[:,2:])
hdbscan_fun.fit(rd_trn)
cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])
# print("############### DF {} at {}##########".format(file,now))
token_cluster=pd.concat([cluster_df,tokens,trn.rating], axis=1)
token_cluster.groupby('rating')
token_cluster[['HDBSCAN','rating']]
pd.crosstab(token_cluster['rating'],token_cluster['HDBSCAN'])