A Predictive Review¶

by: Joshua Roberge¶

### dependencies ###

import pandas as pd
import numpy as np
import random 
import re

### tokenizers #####
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import wordpunct_tokenize

### DTM tehcniques ###
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import wordpunct_tokenize
from nltk import word_tokenize 


## DTM Makers ####
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



#### dimension reduction techniques ######
### dimension reduction teckniques
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.decomposition import SparsePCA
import umap
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

import os
import datetime 



#### models implenmentations ######
## models 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

### models selection ###
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt


### senitment analysis ###
from textblob import TextBlob 
from textblob.sentiments import NaiveBayesAnalyzer 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


### Clustering ####
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN


from mpl_toolkits.mplot3d import Axes3D
from xgboost import plot_importance

############################################################################# Full Read #####################################################################################################
### will use for latter analysis ####

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
reviews= pd.read_csv("Amazon_reviews.csv", names=["rating", "title", "review"],)

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews

#################################################################################### partial sample Read ####################################################################################

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"

n = 3000000 # df size
s = 10000 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

del(skip,n)

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews

Sentiment Analysis:¶

TextBlob
Vader
Graphical Analysis

### text blob
TextBlob(reviews.review[1]).sentiment
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        n+=1000



sent_blob

0% done at 2019-12-12 20:18:40.382848
0% done at 2019-12-12 20:18:43.433088
0% done at 2019-12-12 20:18:46.498303
0% done at 2019-12-12 20:18:52.927312

### vader analysis #####

now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0

for i, review in enumerate(reviews.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10


sent_vader

0% done at 2019-12-12 20:19:44.370207
10% done at 2019-12-12 20:19:47.443441
20% done at 2019-12-12 20:19:51.294726
30% done at 2019-12-12 20:19:55.503255
40% done at 2019-12-12 20:19:59.674867
50% done at 2019-12-12 20:20:04.239209
60% done at 2019-12-12 20:20:09.377592
70% done at 2019-12-12 20:20:15.599099
80% done at 2019-12-12 20:20:21.895568
90% done at 2019-12-12 20:20:29.317125

### saving sentiment #####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_vader.to_pickle('sent_vader.pkl')
sent_blob.to_pickle('sent_blob.pkl')

master=pd.concat([sent_vader, sent_blob], axis=1)
master.to_pickle('master_sent.pkl')
master

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment

####################################################### graphs for sentiment ###############################################################
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_master=pd.read_pickle('master_sent.pkl')
sent_master
ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
sent_master['rating']=ratings
sent_master.vader_neg=sent_master.vader_neg.astype(float)
sent_master.vader_neu=sent_master.vader_neu.astype(float)
sent_master.vader_pos=sent_master.vader_pos.astype(float)
sent_master.vader_compound=sent_master.vader_compound.astype(float)
sent_master=sent_master.sort_values('rating')
grp_sent=sent_master.groupby('rating', as_index=False).mean()

# grp_sent
# sent_master

# sns.barplot(y=grp_sent.vader_neg, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.vader_pos, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.text_blob_subjectivity, x=grp_sent.ratting)
# plt.show()
fig = plt.figure(figsize=(50,50))
plt.style.use(['fivethirtyeight'])
ax1=plt.subplot2grid((4,4), (0,0), rowspan=1, colspan=1)
ax2=plt.subplot2grid((4,4), (0,1), rowspan=1, colspan=1)
ax3=plt.subplot2grid((4,4), (1,0), rowspan=2, colspan=2)


sns.barplot(y=grp_sent.Text_blob_polarity, x=grp_sent.rating, ax=ax1)
sns.barplot(y=grp_sent.vader_compound, x=grp_sent.rating, ax=ax2)
x=sent_master.rating.astype(str)
sns.scatterplot(y=sent_master.Text_blob_polarity, x= sent_master.vader_compound, hue=sent_master.rating.astype(str)+'#',
                palette='twilight',ax=ax3)

ax3.set_title("\n\nText Blob Polairty Vs. Vader Compund Score", weight='bold',size=50)
ax3.set_xlabel("Vader Compound Score",size=35, weight='bold')
ax3.set_ylabel("TextBlob Polairty Score",size=35, weight='bold')
ax3.legend(loc=2, markerscale=6,fancybox=True, shadow=True,title='City Area', prop={'size': 30})

ax1.set_title("Text Blob Polairty Score", weight='bold',size=40)
ax1.set_ylabel("Text Blob Polarity",size=25, weight='bold')
ax1.set_xlabel("Star Rating", size=25, weight='bold')

ax2.set_title("Vader Polairty Score", weight='bold',size=40)
ax2.set_ylabel("Vader Compound Score",size=25, weight='bold')
ax2.set_xlabel("Star Rating",size=25, weight='bold')

plt.show()
fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\sentiment.png')




# sent_master

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment

Creating Document Term Matrices¶

Document Term Frequency Matrix
Weighted Document Term Matrix (L1, L2)

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from nltk import wordpunct_tokenize



### Stop Words ####
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Additions to the stop word list
en_stop.extend([".","-","(", ")","/", r",+", "’", "”","“", "\n",r"\\)","!","?",r"\\(",r"\\'",r'\\"'])

############################################################################ Document Term Matrixs ###################################################################
stem_function= PorterStemmer
port=PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=port().stem(token)
        stems.append(stem)
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
    
    
    
word_list=reviews.review.to_list()   
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
df2.head()

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['+', '\\\\', 'abov', 'ani', 'arent', 'becaus', 'befor', 'couldnt', 'didnt', 'doe', 'doesnt', 'dont', 'dure', 'ha', 'hadnt', 'hasnt', 'havent', 'hi', 'isnt', 'mightnt', 'mustnt', 'neednt', 'onc', 'onli', 'ourselv', 'shant', 'shouldnt', 'shouldv', 'thatll', 'themselv', 'thi', 'veri', 'wa', 'wasnt', 'werent', 'whi', 'wont', 'wouldnt', 'youd', 'youll', 'yourselv', 'youv'] not in stop_words.
  'stop_words.' % sorted(inconsistent))

##################################################################### Environment Cleanup ###################################################################################################
#del(en_stop,string, text, word_list,dtm)

### saving DF2
#df2.to_pickle("freq_df.pkl")


#  del(en_stop,string, text, word_list)

########################################################################### Term Frequancey L1 & L2 ##############################################################################
from sklearn.feature_extraction.text import TfidfTransformer

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
tfdif3 = TfidfTransformer(norm='l2', use_idf=True)


dtm_l2 = tfdif2.fit_transform(df2)
dtm_l1 = tfdif3.fit_transform(df2)


%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"

### l2 DTM #####
dtm_l2=pd.DataFrame(dtm_l2.toarray(), columns=vect.get_feature_names())
dtm_l2=pd.concat([dtm_l2, master], axis=1)
# dtm_l2.shape()
dtm_l2.to_pickle("dtm_l2.pkl") ## using pickle fromat due to size
del(dtm_l2) 

# ### l1 DTM #####
dtm_l1=pd.DataFrame(dtm_l1.toarray(), columns=vect.get_feature_names())
dtm_l1=pd.concat([dtm_l1, master], axis=1)
dtm_l1.to_pickle("dtm_l1.pkl") ## using pickle format due to size
# dtm_l1.shape()

del(dtm_l1)

df2=pd.concat([df2, master], axis=1)
df2.to_pickle("freq_df.pkl")
# df2.shape()

reviews.to_pickle("reviews.pkl")

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's

Train Test Split & TSNE¶

T-SNE tuning
Train Test split and T-SNE Fitting
Graphing T-SNE

############################################################################ T-SNE Tunning ########################################################################
### mapping TSNE ####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
df=pd.read_pickle(directory_main[0])

### aproximating TSNE with sample
# rows=random.sample(range(1, df.shape[0]), 1000)
# df=df.iloc[rows,:]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
y[rows]

x=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)

sns.scatterplot(x=x[:,0], y=x[:,1],hue=y.astype(str)+'_#')

array([[  4.4325747,  -5.829836 ],
       [ 13.77044  ,  -6.8514385],
       [ -6.1297827,   2.7519443],
       ...,
       [-14.941263 ,   2.4553525],
       [ 15.015589 ,  -4.0810432],
       [ -8.19329  ,   5.088409 ]], dtype=float32)

############################################################################ train test split & TSNE Fit #################################################

## this esection creates a train and test direcotry & fits T-SNE #####
now=datetime.datetime.now()
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
name=["dtm_l1","dtm_l2", "dtm_freq"]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

for i, file in enumerate(directory_main):
    
    ### reading in file #####
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
    df=pd.read_pickle(file)
    
    
    ### getting TSNE #####
    sne_df=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)
    sne_df=pd.DataFrame(sne_df, columns=["TSNE_1","TSNE_1"])
    TSNE_file=name[i]+"_TSNE"+".pkl"
    sne_df.to_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\{}".format(TSNE_file))
    df['TSNE_1']=sne_df.iloc[:,0]
    df['TSNE_2']=sne_df.iloc[:,1]
    print("TSNE calcuclation done for ", name[i])
    print("total time ", datetime.datetime.now()-now)
    print("Current Time: ", datetime.datetime.now())

    #### setting up directories for Train Test splits #####
    x_train, x_test, y_train, y_test = train_test_split(df.values, y, test_size=0.05, random_state=42)
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train"
    pd.DataFrame(x_train, columns=df.columns).to_pickle(name[i]+"_x_train"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test"
    pd.DataFrame(x_test, columns=df.columns).to_pickle(name[i]+"_x_test"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train"
    pd.DataFrame(y_train).to_pickle(name[i]+"_y_train"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test"
    pd.DataFrame(y_test).to_pickle(name[i]+"_y_test"+".pkl")

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.454s...
[t-SNE] Computed neighbors for 10000 samples in 149.610s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.318999
[t-SNE] Computed conditional probabilities in 3.087s
[t-SNE] Iteration 50: error = 69.9212418, gradient norm = 0.0189482 (50 iterations in 23.324s)
[t-SNE] Iteration 100: error = 71.1002502, gradient norm = 0.0061875 (50 iterations in 17.171s)
[t-SNE] Iteration 150: error = 71.9981155, gradient norm = 0.0022648 (50 iterations in 18.855s)
[t-SNE] Iteration 200: error = 71.8713760, gradient norm = 0.0040598 (50 iterations in 26.203s)
[t-SNE] Iteration 250: error = 71.8986206, gradient norm = 0.0035788 (50 iterations in 30.483s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.898621
[t-SNE] Iteration 300: error = 2.7940745, gradient norm = 0.0019683 (50 iterations in 46.084s)
[t-SNE] Iteration 350: error = 2.5963926, gradient norm = 0.0015467 (50 iterations in 76.462s)
[t-SNE] Iteration 400: error = 2.4361203, gradient norm = 0.0002727 (50 iterations in 74.450s)
[t-SNE] Iteration 450: error = 2.4035795, gradient norm = 0.0001801 (50 iterations in 51.061s)
[t-SNE] Iteration 500: error = 2.3779471, gradient norm = 0.0001856 (50 iterations in 30.629s)
[t-SNE] Iteration 550: error = 2.3771119, gradient norm = 0.0001429 (50 iterations in 36.838s)
[t-SNE] Iteration 600: error = 2.3640788, gradient norm = 0.0001627 (50 iterations in 34.245s)
[t-SNE] Iteration 650: error = 2.3159752, gradient norm = 0.0004437 (50 iterations in 26.964s)
[t-SNE] Iteration 700: error = 2.1884377, gradient norm = 0.0013692 (50 iterations in 14.183s)
[t-SNE] Iteration 750: error = 2.1858022, gradient norm = 0.0000592 (50 iterations in 13.892s)
[t-SNE] Iteration 800: error = 2.1857400, gradient norm = 0.0000254 (50 iterations in 13.840s)
[t-SNE] Iteration 850: error = 2.1857526, gradient norm = 0.0000142 (50 iterations in 13.721s)
[t-SNE] Iteration 900: error = 2.1858263, gradient norm = 0.0000185 (50 iterations in 13.658s)
[t-SNE] Iteration 950: error = 2.1859376, gradient norm = 0.0000097 (50 iterations in 13.788s)
[t-SNE] Iteration 1000: error = 2.1861193, gradient norm = 0.0000105 (50 iterations in 13.879s)
[t-SNE] KL divergence after 1000 iterations: 2.186119
TSNE calcuclation done for  dtm_l1
total time  0:12:23.792472
Current Time:  2019-12-13 00:32:03.176892
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.447s...
[t-SNE] Computed neighbors for 10000 samples in 81.599s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.109157
[t-SNE] Computed conditional probabilities in 3.547s
[t-SNE] Iteration 50: error = 67.7459335, gradient norm = 0.0055018 (50 iterations in 15.446s)
[t-SNE] Iteration 100: error = 60.2961349, gradient norm = 0.0007880 (50 iterations in 15.800s)
[t-SNE] Iteration 150: error = 60.1451607, gradient norm = 0.0000250 (50 iterations in 13.398s)
[t-SNE] Iteration 200: error = 60.1399765, gradient norm = 0.0000413 (50 iterations in 12.312s)
[t-SNE] Iteration 250: error = 60.1398468, gradient norm = 0.0000249 (50 iterations in 12.501s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.139847
[t-SNE] Iteration 300: error = 1.6233674, gradient norm = 0.0019057 (50 iterations in 13.850s)
[t-SNE] Iteration 350: error = 1.3570467, gradient norm = 0.0003720 (50 iterations in 15.129s)
[t-SNE] Iteration 400: error = 1.2828726, gradient norm = 0.0001797 (50 iterations in 16.288s)
[t-SNE] Iteration 450: error = 1.2504454, gradient norm = 0.0001028 (50 iterations in 16.358s)
[t-SNE] Iteration 500: error = 1.2338892, gradient norm = 0.0000704 (50 iterations in 15.778s)
[t-SNE] Iteration 550: error = 1.2245333, gradient norm = 0.0000457 (50 iterations in 15.971s)
[t-SNE] Iteration 600: error = 1.2187227, gradient norm = 0.0000328 (50 iterations in 15.571s)
[t-SNE] Iteration 650: error = 1.2149807, gradient norm = 0.0000259 (50 iterations in 16.413s)
[t-SNE] Iteration 700: error = 1.2124690, gradient norm = 0.0000203 (50 iterations in 16.469s)
[t-SNE] Iteration 750: error = 1.2106905, gradient norm = 0.0000205 (50 iterations in 16.630s)
[t-SNE] Iteration 800: error = 1.2095976, gradient norm = 0.0000152 (50 iterations in 15.488s)
[t-SNE] Iteration 850: error = 1.2088320, gradient norm = 0.0000142 (50 iterations in 15.448s)
[t-SNE] Iteration 900: error = 1.2083148, gradient norm = 0.0000136 (50 iterations in 16.687s)
[t-SNE] Iteration 950: error = 1.2080100, gradient norm = 0.0000120 (50 iterations in 15.635s)
[t-SNE] Iteration 1000: error = 1.2077711, gradient norm = 0.0000104 (50 iterations in 15.261s)
[t-SNE] KL divergence after 1000 iterations: 1.207771
TSNE calcuclation done for  dtm_l2
total time  0:18:59.970836
Current Time:  2019-12-13 00:38:39.354255
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.727s...
[t-SNE] Computed neighbors for 10000 samples in 164.801s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.473225
[t-SNE] Computed conditional probabilities in 2.914s
[t-SNE] Iteration 50: error = 68.2438889, gradient norm = 0.1391273 (50 iterations in 20.014s)
[t-SNE] Iteration 100: error = 69.6944733, gradient norm = 0.0865247 (50 iterations in 20.759s)
[t-SNE] Iteration 150: error = 69.5777817, gradient norm = 0.1210581 (50 iterations in 22.471s)
[t-SNE] Iteration 200: error = 69.5208130, gradient norm = 0.1019059 (50 iterations in 20.512s)
[t-SNE] Iteration 250: error = 69.9595261, gradient norm = 0.0926369 (50 iterations in 17.891s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.959526
[t-SNE] Iteration 300: error = 2.6270587, gradient norm = 0.0041577 (50 iterations in 16.372s)
[t-SNE] Iteration 350: error = 2.4387221, gradient norm = 0.0055062 (50 iterations in 15.815s)
[t-SNE] Iteration 400: error = 2.4200284, gradient norm = 0.0002775 (50 iterations in 15.821s)
[t-SNE] Iteration 450: error = 2.4162779, gradient norm = 0.0000789 (50 iterations in 15.602s)
[t-SNE] Iteration 500: error = 2.4151018, gradient norm = 0.0000439 (50 iterations in 15.792s)
[t-SNE] Iteration 550: error = 2.4143989, gradient norm = 0.0000415 (50 iterations in 16.028s)
[t-SNE] Iteration 600: error = 2.4138396, gradient norm = 0.0000240 (50 iterations in 15.866s)
[t-SNE] Iteration 650: error = 2.4133272, gradient norm = 0.0000871 (50 iterations in 15.958s)
[t-SNE] Iteration 700: error = 2.4129448, gradient norm = 0.0000293 (50 iterations in 15.966s)
[t-SNE] Iteration 750: error = 2.4127402, gradient norm = 0.0000227 (50 iterations in 15.986s)
[t-SNE] Iteration 800: error = 2.4125929, gradient norm = 0.0000113 (50 iterations in 16.272s)
[t-SNE] Iteration 850: error = 2.4125130, gradient norm = 0.0000088 (50 iterations in 16.384s)
[t-SNE] Iteration 900: error = 2.4124386, gradient norm = 0.0000104 (50 iterations in 16.308s)
[t-SNE] Iteration 950: error = 2.4124711, gradient norm = 0.0000077 (50 iterations in 16.290s)
[t-SNE] Iteration 1000: error = 2.4125392, gradient norm = 0.0000076 (50 iterations in 16.147s)
[t-SNE] KL divergence after 1000 iterations: 2.412539
TSNE calcuclation done for  dtm_freq
total time  0:27:35.701970
Current Time:  2019-12-13 00:47:15.085389
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test

################################################################################ TSNE Graph ####################################################################################
# tsne=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\dtm_l1_TSNE.pkl')
# ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

# tsne['Ratings']=ratings
# tsne=tsne.sort_values('Ratings')
# tsne['Ratings']=tsne['Ratings'].astype(str)+'#'
ones=tsne[tsne.Ratings=='1#']
fives=tsne[tsne.Ratings=='5#']

fig, axe = plt.subplots(figsize=(20,20))


sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.3 )
sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.3)
sns.scatterplot(x=tsne.iloc[:,0], y=tsne.iloc[:,1], hue=tsne['Ratings'], palette='twilight',
                x_jitter=10,y_jitter=10,alpha=.8,style=tsne['Ratings'], ax=axe)
axe.set_title('T-SNE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('T-SNE Component One', size=30)
axe.set_xlabel('T-SNE Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\TSNE.png')

### Clean Up ####
# del(df, directory_main, file, i, name,names, y, x_train, x_test, y_train, y_test)

Dimension Reduction¶

This section is not in the most logical order so I will explain. The first initial dimension reduction were just fit based off of default settings. After running the first batch of models and narrowing down my DTMs the dimensions were then re-worked and more finely tunned.

1st dimension reductions
l1 dimension reductions
UMap Graph
LLE Graph

########################################################################### First Dimension Reductions #############################################################################

import warnings
warnings.simplefilter("ignore")

now=datetime.datetime.now()

#### dimension Reduction ####
dimensions=3
n_neighbores=15

#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=15, n_components=3)
lle_fun=LLE(n_neighbors=5,n_components=3,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)
pca_fun=PCA(n_components=.8)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=["dtm_freq", "dtm_l1", "dtm_l2"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")


for i, file in enumerate(x_train_directory):
    
    
    ### take out [0:50] to run whole dataset ####
    drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
    x_train=pd.read_pickle(path+"x_train\\"+file)
    sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_train=x_train[['TSNE_1','TSNE_2']]
    x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

    x_test=pd.read_pickle(path+"x_test\\"+x_test_directory[i])
    sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_test=x_test[['TSNE_1','TSNE_2']]
    x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

    ### Sparse ####
    Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
    col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
    sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
    sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
    print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### PCA #####
    PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
    col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
    PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
    PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
    print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### U Map #####
    u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
    col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
    umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
    umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
    print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### lle ####
    lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
    col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
    lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
    lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
    print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   


    ### Saving\creating DF's ###
    train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
    test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)
    
    test_df.to_pickle(path+"dimension_rd_x_test\\"+names[i]+"_dim_rd_test.pkl")
    train_df.to_pickle(path+"dimension_rd_x_train\\"+names[i]+"_dim_rd_train.pkl")
    
    print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)

Sparse for DTM dtm_freq_x_train.pkl
total time : 0:00:50.817578


PCA for DTM dtm_freq_x_train.pkl
total time : 0:00:54.807404

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))



UMAP for DTM dtm_freq_x_train.pkl
total time : 0:02:53.363985

LLE for DTM dtm_freq_x_train.pkl
total time : 0:04:28.076542

############################# File Saved At 2019-12-13 00:51:45.254088 #################################

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)

Sparse for DTM dtm_l1_x_train.pkl
total time : 0:04:36.999753


PCA for DTM dtm_l1_x_train.pkl
total time : 0:04:41.105059

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))



UMAP for DTM dtm_l1_x_train.pkl
total time : 0:06:07.503040

LLE for DTM dtm_l1_x_train.pkl
total time : 0:07:21.703124

############################# File Saved At 2019-12-13 00:54:38.914673 #################################

C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)

Sparse for DTM dtm_l2_x_train.pkl
total time : 0:07:29.011671


PCA for DTM dtm_l2_x_train.pkl
total time : 0:07:33.044988

####################################################################### L1 Dimension Reduction ##############################################################################

import warnings
warnings.simplefilter("ignore")

now=datetime.datetime.now()

#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)
pca_fun=PCA(n_components=.8)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=[ "dtm_l1"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")[1]
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")[1]

file=x_train_directory
i=0


################################ reading DataFrame ###########################################################
### take out [0:50] to run whole dataset ####
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
x_train=pd.read_pickle(path+"x_train\\"+file)
sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_train=x_train[['TSNE_1','TSNE_2']]
x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

x_test=pd.read_pickle(path+"x_test\\"+x_test_directory)
sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_test=x_test[['TSNE_1','TSNE_2']]
x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

#################################### Sparse #################################
Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### PCA ########################################
PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### U Map #######################################
u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

#################################### lle ############################################
lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   


############################## Saving\creating DF's ####################################################
train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)

test_df.to_pickle(path+"dimension_rd_x_test_2\\"+names[i]+"_dim_rd_test_2.pkl")
train_df.to_pickle(path+"dimension_rd_x_train_2\\"+names[i]+"_dim_rd_train_2.pkl")

print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))

Sparse for DTM dtm_l1_x_train.pkl
total time : 0:00:10.212767


PCA for DTM dtm_l1_x_train.pkl
total time : 0:00:14.086052


UMAP for DTM dtm_l1_x_train.pkl
total time : 0:08:17.256118

LLE for DTM dtm_l1_x_train.pkl
total time : 0:10:18.598185

############################# File Saved At 2019-12-15 20:52:53.456199 #################################

####################################################### Saving fits ####################################################
import pickle
dimension_tec=[umap_fun, lle_fun, sparce_fun, pca_fun]
dimension_name=['umap_fun.pickle', 'lle_fun.pickle', 'sparce_fun.pickle', 'PCA_fun.pickle']
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects"
for i, fit in enumerate(dimension_tec):
    pickle_out = open(dimension_name[i],"wb")
    pickle.dump(fit, pickle_out)
    pickle_out.close()

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects

########################################################################### UMAP Graph ##############################################################################################

#### reading in files
y_train_l1=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')


#### used for tunning UMap Wich was latter applied to the finale dimensions
# umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
# u_map_transform_train=umap_fun.fit_transform(x_train.iloc[:,:-8])
# raw=u_map_transform_train
# u_map_transform_train=pd.DataFrame(u_map_transform_train, columns=['UMAP_1', 'UMAP_2'])
# u_map_transform_train['rating']=y_train_l1.rating
# u_map_transform_train=u_map_transform_train.sort_values('rating')
# u_map_transform_train.rating=u_map_transform_train.rating.astype(str)+'#'
# u_map_transform_train=u_map_transform_train[u_map_transform_train.rating!='nan#']
# ones=u_map_transform_train[u_map_transform_train.rating=='1#']
# fives=u_map_transform_train[u_map_transform_train.rating=='5#']


########################################### plotting UMAP ##############################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones and Fives Densities
# sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )
# sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### Total Densities
sns.kdeplot(data=u_map_transform_train.iloc[:,0],data2=u_map_transform_train.iloc[:,1], cmap="twilight", ax=axe, shade=True, alpha=.4 )

### individual Points
sns.scatterplot(u_map_transform_train.UMAP_1, u_map_transform_train.UMAP_2, hue=u_map_transform_train.rating ,
                style=u_map_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('UMAP Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('UMAP Component One', size=30)
axe.set_xlabel('UMAP Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})



#plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\UMAP.png')

####################################################################### LLE Graph ##############################################################################################


# lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
# lle_transform_train=lle_fun.fit_transform(x_train.iloc[:3000,:-8]) # fitting to the training set

lle_transform_train=pd.DataFrame(lle_transform_train, columns=['LLE_1', 'LLE_2'])
lle_transform_train['rating']=y_train_l1.rating
lle_transform_train=lle_transform_train.sort_values('rating')
lle_transform_train.rating=lle_transform_train.rating.astype(str)+'#'
lle_transform_train=lle_transform_train[lle_transform_train.rating!='nan#']
ones=lle_transform_train[lle_transform_train.rating=='1#']
fives=lle_transform_train[lle_transform_train.rating=='5#']



##################################################### Plotting LLE ############################################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones Desnsity ###
#sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )

### Twos Desnity #####
#sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### total Density ####
sns.kdeplot(data=lle_transform_train.iloc[:,1],data2=lle_transform_train.iloc[:,0],cmap="twilight", ax=axe, shade=True, alpha=.4 )

sns.scatterplot(lle_transform_train.LLE_2, lle_transform_train.LLE_1, hue=lle_transform_train.rating ,
                style=lle_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('LLE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('LLE Component One', size=30)
axe.set_xlabel('LLE Component Two', size=30)

axe.set_ylim(-.02,.03)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

# axe.set_xlim(-.05,.08)


plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\LLE.png')
plt.show()

Building the Models¶

First Param Grid
First Initial Large Grid Search
Analyzing / Graphing Top Model
Fitting XgBoost on new features and more parameters
Plotting Feature Importance
Single Fit Algorithum Cell
Y test Fit Cell
Data Pipe Line Test

######################################################################### first Paramgrid ####################################################################################

knn_params={
    'p':[2],
    'n_neighbors':[5,10,20],
}

random_params={
    'n_estimators':[100,500],
    'min_samples_split':[5,10],
    'min_samples_leaf':[5,10],
}
gradient_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'min_samples_split':[5],
    'min_samples_leaf':[5],
    'random_state':[4],
}
xg_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'max_depth':[5,7,9],
}

master_params=[knn_params, random_params, gradient_params, xg_params]
models=[KNeighborsClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier]

######################################################################### first inital GridSearch ##################################################################################


path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train")
df_names=["freq", "l1","l2"]
master_1=pd.DataFrame()

##### selecting only 100 PCA's (issue with time) ##########################
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'TSNE_1','TSNE_2'])

for i, x_train in enumerate(x_train_files):
    #### reading files #######
    x_train=pd.read_pickle(path+"dimension_rd_x_train\\{}".format(x_train))
    y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[i])).rating.values
    x_train=x_train.loc[:,cols].values
    
    for j, model in enumerate(models):
        #### fitting models ####
        clf=GridSearchCV(model(), master_params[j], n_jobs= -1 ,cv=5, verbose=10)
        new_models=clf.fit(x_train, y_train)
        df=pd.DataFrame(new_models.cv_results_)
        df['df']=df_names[i]
        df['cols']='All'
        df['model']=model.__name__
        master_1=pd.concat([master_1,df], axis=0)
        print('Finished with df {} model {}'.format(df_names[i],model.__name__))
        print("Total time:", datetime.datetime.now()-now)
        print("\n\n#################################################### Fin #############################################################")

Fitting 5 folds for each of 3 candidates, totalling 15 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    6.2s remaining:   41.1s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    6.8s remaining:   18.8s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:    7.4s remaining:   11.2s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    7.8s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:   12.5s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   12.8s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   13.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.

Finished with df freq model KNeighborsClassifier
Total time: 0:00:13.897160


#################################################### Fin #############################################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  1.3min remaining:   26.0s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:  1.5min remaining:   12.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.

Finished with df freq model RandomForestClassifier
Total time: 0:02:36.127865


#################################################### Fin #############################################################
Fitting 5 folds for each of 18 candidates, totalling 90 fits

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed: 41.3min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 43.6min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.

Finished with df freq model GradientBoostingClassifier
Total time: 0:50:03.433879


#################################################### Fin #############################################################
Fitting 5 folds for each of 54 candidates, totalling 270 fits

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 44.6min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 50.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 61.8min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 66.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 80.7min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 90.0min
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed: 97.8min
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 107.2min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 125.1min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.

Finished with df freq model XGBClassifier
Total time: 2:59:11.766535


#################################################### Fin #############################################################
Fitting 5 folds for each of 3 candidates, totalling 15 fits

[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    1.0s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    2.6s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:    2.9s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    3.4s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    3.7s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:    4.0s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.3s finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.

######################################################################### top Model Graphs Round One #########################################################################################

rd_one=pd.read_csv('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\master_all_cols.csv')
display(rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].max())
df_3=rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].agg(['min','max','mean','std'],as_index=False)

df_3=df_3.reset_index()
sns.set_style('darkgrid')
df_3=df_3.sort_values('max', ascending=False)
fig, ax= plt.subplots(figsize=(11,5))
sns.barplot(y=df_3['model'], x=df_3['max'], hue=df_3.df, palette='mako')
ax.set_title('Top Model by Algorithum and DTM')
ax.set_ylabel('')
ax.set_xlabel('Accuracey')
#ax.annotate('afg',(34,45))
axe.legend(loc=2, markerscale=5,fancybox=True, shadow=True, prop={'size': 30})
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\top_models.png')
#ax.table(df_3)

###################################################################### NEW Feature Fit + Various Paremeters ########################################################################
### this cell dynamically changed as new features and parameters were test on feature importance


xg_params={
    'learning_rate':[.1,.001,.01],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'max_depth':[5,9,15],
    
}

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_3=pd.DataFrame()

####### feature importance selection ################3
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:200]
# cols=["PCA_{}".format(x) for x in range(100)]
# cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
#              'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']


##################### Reading in File ############################################
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols].values
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
    

#### fitting models ####
clf=GridSearchCV(XGBClassifier(tree_method='gpu_hist'), xg_params ,cv=3, verbose=10)
new_models=clf.fit(x_train, y_train)
df=pd.DataFrame(new_models.cv_results_)
df['df']=df_names
df['cols']='to 50 ft'
df['model']=XGBClassifier.__name__
master_3=pd.concat([master_2,df], axis=0)
print('Finished with df {} model {}'.format(df_names[i],model.__name__))
print("Total time:", datetime.datetime.now()-now)
print("\n\n#################################################### Fin #############################################################")

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.

[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.408, total=   6.6s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s

[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.409, total=   6.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.1s remaining:    0.0s

[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.399, total=   6.7s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8 .

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.9s remaining:    0.0s

[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8, score=0.405, total=   6.4s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8 .

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.3s remaining:    0.0s

####################################################################### Creating Feature Importance DataFrame ################################################################################
###
#clf.estimator.fit(x_train, y_train)
# clf_1=XGBClassifier(**clf.best_params_)
# clf_1.fit(x_train, y_train)
# clf_1.feature_importances_
# plot_importance(clf_1,max_num_features=30 )
# x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
# x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]]
# ft_import_df=pd.DataFrame()
# ft_import_df['ft_im']=clf_1.feature_importances_
# ft_import_df['cols']=x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]].columns
# ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
#ft_import_df.to_pickle('ft_imp_df.pkl')
# fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
# fin_100.mean_test_score.max()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, tree_method='gpu_hist', verbosity=1)

####################################################################### Plotting Feature Importance  ####################################################################################
sns.set_style("dark")
ft_import_df_1=ft_import_df.sort_values('ft_im', ascending=False)
fig, ax= plt.subplots(figsize=(20,10))
sns.barplot(ft_import_df_1.ft_im[0:16],ft_import_df_1.cols[0:16],ax=ax, palette='twilight')
ax.set_title("XGBoost's Top 16 Most Important Features\n", size=30,fontdict= {'family':'MS Reference Sans Serif','weight':'bold'}, weight='bold'  )
ax.set_ylabel('')
ax.set_xlabel('F-Score', weight='bold', size=15)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\XG_ft_importance_fig.png')

fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
fin_100.mean_test_score.max()

0.4203157894736842

########################################################################## Single Fit Algorithum Cell #####################################################################################
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train_files=os.listdir(path+"y_train")
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1]))

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
params={'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'subsample': 0.6,
 'tree_method': 'gpu_hist'}


col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']



i=0
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')

##### Selecting top 100 ft ##########################
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols]
y_train=pd.read_pickle(path+"y_train\\dtm_l1_y_train.pkl").rating.values
    

#### fitting models ####
for col in col_list_fix:
    x_test[col]=x_test[col].astype(float)
    x_train[col]=x_train[col].astype(float)


from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
clf=XGBClassifier(**params)
clf.fit(x_train,y_train)
scores = cross_validate(clf, x_train , y_train, cv=5)
scores

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-260-6a65c6ee9079> in <module>
     25 y_train_files=os.listdir(path+"y_train")
     26 x_train_files=os.listdir(path+"dimension_rd_x_train_2")
---> 27 ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')
     28 
     29 ##### Selecting top 100 ft ##########################

~\Anaconda3\envs\PyhtonAndR\lib\site-packages\pandas\io\pickle.py in read_pickle(path, compression)
    143     """
    144     path = _stringify_path(path)
--> 145     f, fh = _get_handle(path, "rb", compression=compression, is_text=False)
    146 
    147     # 1) try standard libary Pickle

~\Anaconda3\envs\PyhtonAndR\lib\site-packages\pandas\io\common.py in _get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text)
    403         else:
    404             # Binary mode
--> 405             f = open(path_or_buf, mode)
    406         handles.append(f)
    407 

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl'

############################################################### Test fit For Y Variable ###################################################################

x_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test\\dtm_l1_dim_rd_test.pkl")
y_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl")
y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl")
x_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_train\\dtm_l1_dim_rd_train.pkl")
col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']

for col in col_list_fix:
    x_test[col]=x_test[col].astype(float)
    x_train[col]=x_train[col].astype(float)

y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\\y_train\\dtm_l1_y_train.pkl")

# display(x_test.head())
# x_train.head()
# clf.fit(x_train[cols].values,y_train)
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

pred=clf.predict(x_test[cols])
pred_1=clf.predict(x_train[cols])
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print("\n\n\n test df results")
confusion_matrix(y_test,pred_1)
print(classification_report(y_train, pred_1))



print("\n\n\n test df results")
confusion_matrix(y_test,pred)
print(classification_report(y_test, pred))

####################################################### saving model ####################################################
import pickle

pickle_out = open("clf.pickle","wb")
pickle.dump(clf, pickle_out)
pickle_out.close()

##################################################################### Creating a Data Pipline to Test #################################################################################

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
from sklearn.feature_extraction.text import TfidfTransformer

n = 3000000 # df size
s = 800 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews_test = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0

###### vader
for i, review in enumerate(reviews_test.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10

### text blob
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews_test.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        n+=1000
    
master=pd.concat([sent_vader, sent_blob], axis=1)

stem_function= PorterStemmer
port=PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=port().stem(token)
        stems.append(stem)
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
    
    
    
word_list=reviews_test.review.to_list()   
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
dtm_l2 = tfdif2.fit_transform(df2)
df2=pd.concat([df2, master], axis=1)
df2

sent_test=df2[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']
dimension_tec=[umap_fun, lle_fun, sparce_fun,pca_fun]
cols=[col for col in df2.columns if col not in drops]
dtm=df2[cols]

umap_ft=umap_fun.transform(dtm)
col_name=["UMAP_{}".format(x) for x in range(umap_ft.shape[1])] # creating col names
umap_df_train=pd.DataFrame(umap_ft, columns=col_name) # Sparse train df

lle_ft=lle_fun.transform(dtm)
col_name=["LLE_{}".format(x) for x in range(lle_ft.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_ft, columns=col_name) # Sparse train df

sparce_ft=sparce_fun.transform(dtm)
col_name=["Sparse_{}".format(x) for x in range(sparce_ft.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(sparce_ft, columns=col_name) # Sparse train df

pca_ft=pca_fun.transform(dtm)
col_name=["PCA_{}".format(x) for x in range(pca_ft.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(pca_ft, columns=col_name) # Sparse train df

train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,master,lle_df_train], axis=1)

chng=['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
for col in chng:
    train_df[col]=train_df[col].astype(float)
    


cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

pred=clf.predict(train_df[cols])
# pred=cross_val_predict(clf, X, y)

print(classification_report(reviews_test.rating, pred))

i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_2=pd.DataFrame()

##### selecting only 100 PCA's (issue with time) ##########################
# cols=["PCA_{}".format(x) for x in range(100)]
#cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             #'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']


x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
#x_train=x_train.loc[:,cols]

x_train[[col for col in x_train.columns if col not in ['TSNE_1', 'TSNE_2']]]

### feature engineering: most Frequent Words ####
# x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')
# y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# x_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test\\dtm_l1_x_test.pkl')
# y_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl')
# ###### Most Frequent Words #########

# top=500
# df=x_train
# df['rating']=y_train.rating
# # l1_dtm['rating']=rev.rating
# # display(l1_dtm.melt(var_name='rating'))
# top_1=df.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
# top_2=df.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
# top_3=df.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
# top_4=df.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
# top_5=df.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

# one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3]
# two_toks=[col for col in top_2 if col not in top_5 | top_4 ]
# three_toks=[col for col in top_3 if col not in top_5 | top_1]
# four_toks=[col for col in top_4 if col not in top_2|top_1]
# five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]


# x=['familiar' for x in range(len(one_toks))]
# g=0
# freq_map={**{i:'Low star' for i in one_toks},**{i:'Med star' for i in three_toks},**{i:'High star' for i in five_toks}}


for dex in df.T.iloc[:-9,0].index:
    df.T.loc[dex,]

Index(['$', '&', '--', '1', '1 2', '10', '100', '12', '15', '2',
       ...
       'wrong', 'wrote', 'ye', 'year', 'year ago', 'year old', 'yet', 'youll',
       'young', 'youv'],
      dtype='object', length=1000)

###### feature Engineering: Binning polarity
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test_2\\dtm_l1_dim_rd_test_2.pkl')
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')

pd.cut(x_train.vader_compound

0      0.7906
1      0.4926
2      0.1265
3      0.1263
4      0.9531
        ...  
495    0.8233
496    0.9351
497    0.4215
498   -0.6537
499    0.7003
Name: vader_compound, Length: 500, dtype: object

Clustering Process¶

HDBSCAN
DBSCAN
K-Means
Most Frequenct Words
Token Cluster Attempt

#################################################################################### HDBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

### reducing cols ####
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])

### getting file directoryies 
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')
sent_dir=os.listdir(path+'sentiment')


now=datetime.datetime.now()
hdbscan_fun=HDBSCAN(min_cluster_size=500, min_samples=50, core_dist_n_jobs=-1,metric='manhattan')
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])


cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
#hdbscan_fun.fit(df[cols].values)
hdbscan_fun.fit(df[sent])

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# cluster_df.to_pickle(names[i]+'_cluster.pkl')
# print("############### DF {} at {}##########".format(file,now))

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
df.iloc[:,-13:]
sent=['vader_compound','Text_blob_polarity']

cluster_df['rating']=y_train.rating
pd.crosstab(cluster_df.HDBSCAN.astype(str),cluster_df.rating.astype(str))

######################################################################### HDBSCAN Crosstab #######################################################################################

cluster_df['rating']=y_train.rating.astype(str)
cluster_df.HDBSCAN=cluster_df.HDBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['HDBSCAN'],margins=True, margins_name='total')

#################################################################################### DBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
db_fun= DBSCAN(eps=.5,min_samples=500,p=2, n_jobs=-1,leaf_size=100)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
# cols=['TSNE_1','TSNE_2']
# cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
db_fun.fit(df[sent].values)
#db_fun.fit(df.values)

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['DBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters

################################################################ DBSCAN Cross Tab ########################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.DBSCAN=cluster_df.DBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['DBSCAN'],margins=True, margins_name='total')

#################################################################################### K-MEANS ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
km_fun= KMeans(n_clusters=5, n_jobs=-1)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
#cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
#cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
cols=['TSNE_1','TSNE_2']
#cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
km_fun.fit(df[sent].values)
#km_fun.fit(df.values)

cluster=km_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['K_MEANS'])  
# print("############### DF {} at {}##########".format(file,now))

C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters

######################################################################### K-Means Cross Tab ##################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.K_MEANS=cluster_df.K_MEANS.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['K_MEANS'],margins=True, margins_name='total')

################# Clustering on tokens #################
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")
rev=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl")

####################################################################### Frequency of word by Ratings ##################################################

# l1_dtm=l1_dtm.iloc[:,:-6]
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")

top=300
l1_dtm['rating']=rev.rating
# display(l1_dtm.melt(var_name='rating'))
top_1=l1_dtm.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
top_2=l1_dtm.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
top_3=l1_dtm.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
top_4=l1_dtm.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
top_5=l1_dtm.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3 | top_2 ]
two_toks=[col for col in top_2 if col not in top_5 | top_4 |]
three_toks=[col for col in top_3 if col not in top_5 | top_1]
four_toks=[col for col in top_4 if col not in top_2|top_1]
five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]

################################################################################# Clustering Token Atempt  ########################################################################################
trn=l1_dtm.pivot(columns='rating').T.reset_index().fillna(0)

tokens=trn.level_0
tokens=trn.rating
trn



umap_fun=umap.UMAP(n_neighbors=50, n_components=10, metric='cosine')

rd_trn=umap_fun.fit_transform(trn.iloc[:,2:])

hdbscan_fun.fit(rd_trn)

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))

token_cluster=pd.concat([cluster_df,tokens,trn.rating], axis=1)
token_cluster.groupby('rating')
token_cluster[['HDBSCAN','rating']]
pd.crosstab(token_cluster['rating'],token_cluster['HDBSCAN'])

	Text_blob_polarity	text_blob_subjectivity
0	0.000000	0.000000
1	-0.021429	0.576190
2	0.277273	0.511364
3	0.253030	0.492424
4	0.284762	0.508333
...	...	...
9995	0.500000	0.400000
9996	0.068750	0.456250
9997	0.234091	0.437500
9998	0.136364	0.454545
9999	0.296667	0.626667

	vader_neg	vader_neu	vader_pos	vader_compound
0	0.111	0.741	0.148	0.2003
1	0.106	0.853	0.041	-0.3855
2	0	0.863	0.137	0.9272
3	0.071	0.858	0.071	0.0249
4	0.058	0.746	0.197	0.92
...	...	...	...	...
9995	0.195	0.805	0	-0.3867
9996	0.064	0.837	0.1	0.1747
9997	0	0.78	0.22	0.875
9998	0.097	0.903	0	-0.575
9999	0	0.567	0.433	0.8655

	vader_neg	vader_neu	vader_pos	vader_compound	Text_blob_polarity	text_blob_subjectivity
0	0.111	0.741	0.148	0.2003	0.000000	0.000000
1	0.106	0.853	0.041	-0.3855	-0.021429	0.576190
2	0	0.863	0.137	0.9272	0.277273	0.511364
3	0.071	0.858	0.071	0.0249	0.253030	0.492424
4	0.058	0.746	0.197	0.92	0.284762	0.508333
...	...	...	...	...	...	...
9995	0.195	0.805	0	-0.3867	0.500000	0.400000
9996	0.064	0.837	0.1	0.1747	0.068750	0.456250
9997	0	0.78	0.22	0.875	0.234091	0.437500
9998	0.097	0.903	0	-0.575	0.136364	0.454545
9999	0	0.567	0.433	0.8655	0.296667	0.626667

	Sparse_0	Sparse_1	Sparse_2	PCA_0	PCA_1	PCA_2	PCA_3	PCA_4	PCA_5	PCA_6	...	UMAP_0	UMAP_1	vader_neg	vader_neu	vader_pos	vader_compound	Text_blob_polarity	text_blob_subjectivity	LLE_0	LLE_1
0	-0.003447	-0.000146	-0.001338	-0.065991	0.028139	0.072930	-0.085083	-0.060750	0.044697	0.038091	...	-4.084264	6.528867	0.094	0.906	0	-0.6486	-0.0642857	0.52619	0.007993	-0.005878
1	0.000000	0.000000	0.000000	-0.071607	-0.082483	-0.001451	0.001518	-0.027734	0.044720	0.046058	...	-1.139816	2.391321	0	0.826	0.174	0.4939	-0.00416667	0.178571	-0.002267	-0.003622
2	-0.004382	0.000167	-0.015122	-0.053353	0.035840	0.105020	-0.068836	-0.044808	0.004433	-0.011878	...	-4.211996	5.050555	0.113	0.704	0.182	0.6674	-0.019725	0.657407	0.002207	-0.002527
3	-0.008564	0.000079	0.000000	-0.087841	-0.113705	-0.019849	-0.004054	-0.066870	-0.046401	-0.023208	...	-1.442914	1.960394	0.082	0.779	0.139	0.7287	0.206667	0.555	0.002733	0.005821
4	-0.000158	0.000006	-0.022199	-0.048134	0.052211	0.162864	-0.100061	-0.066620	0.021119	0.039493	...	-2.902161	5.783535	0.018	0.83	0.152	0.8338	0.151786	0.642857	0.011837	-0.003232
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9495	-0.011959	-0.026144	0.000000	0.310136	0.018771	-0.015299	-0.018336	-0.029080	-0.018617	0.035390	...	-5.325161	4.488866	0	0.764	0.236	0.9779	0.323214	0.514286	0.001513	0.001566
9496	-0.007915	-0.013018	0.000000	0.128538	-0.011163	0.011558	-0.024980	-0.019052	0.040633	0.002340	...	-5.991697	3.430374	0.118	0.782	0.099	-0.4734	0.300278	0.63381	0.002872	0.002746
9497	-0.011722	0.000165	0.000000	-0.094820	0.130341	-0.144374	-0.007765	0.021522	-0.123656	0.041907	...	-3.833734	-0.690987	0.239	0.691	0.069	-0.946	-0.328571	0.614286	0.001560	0.002668
9498	-0.003490	-0.020510	0.000000	0.221674	-0.010095	-0.064216	-0.096943	-0.049029	0.029237	0.077646	...	-5.227555	3.110168	0.041	0.83	0.129	0.5859	0.266667	0.55	-0.000388	0.000630
9499	-0.003749	-0.009046	0.000000	0.084163	-0.015090	0.007058	-0.025886	-0.013741	0.067081	0.047346	...	-5.806878	5.106863	0.051	0.648	0.301	0.9185	-0.0745833	0.473333	-0.000098	0.000991

K_MEANS	0	1	2	3	4	total
rating
1	296	286	591	367	293	1833
2	329	289	606	368	248	1840
3	341	281	567	364	244	1797
4	313	272	572	351	258	1766
5	297	271	619	343	247	1777
total	1576	1399	2955	1793	1290	9013

	2	...
0	0	...
1	0	...
2	1	...
3	0	...
4	0	...

	model	df	mean_test_score
0	GradientBoostingClassifier	freq	0.396526
1	GradientBoostingClassifier	l1	0.405474
2	GradientBoostingClassifier	l2	0.399895
3	KNeighborsClassifier	freq	0.307579
4	KNeighborsClassifier	l1	0.317263
5	KNeighborsClassifier	l2	0.317368
6	RandomForestClassifier	freq	0.389474
7	RandomForestClassifier	l1	0.398421
8	RandomForestClassifier	l2	0.393474
9	XGBClassifier	freq	0.404000
10	XGBClassifier	l1	0.413579
11	XGBClassifier	l2	0.408316

rating	1.0	2.0	3.0	4.0	5.0	nan
HDBSCAN
-1	493	499	482	481	455	148
0	187	167	164	167	170	51
1	1153	1174	1151	1118	1152	288

HDBSCAN	-1	0	1	2	3	4	5	6	7	total
rating
1	1343	29	130	73	2	1	183	60	12	1833
2	1408	31	115	57	4	1	154	61	9	1840
3	1350	39	109	54	3	3	177	53	9	1797
4	1325	34	114	57	1	4	169	55	7	1766
5	1359	29	115	52	1	1	157	53	10	1777
total	6785	162	583	293	11	10	840	282	47	9013

	2	...
0	0	...
1	0	...
2	1	...
3	0	...
4	0	...

	2	...
0	0	...
1	0	...
2	1	...
3	0	...
4	0	...