A Predictive Review

by: Joshua Roberge

In [2]:
### dependencies ###

import pandas as pd
import numpy as np
import random 
import re

### tokenizers #####
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import wordpunct_tokenize

### DTM tehcniques ###
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import wordpunct_tokenize
from nltk import word_tokenize 

## DTM Makers ####
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#### dimension reduction techniques ######
### dimension reduction teckniques
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.decomposition import SparsePCA
import umap
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

import os
import datetime 

#### models implenmentations ######
## models 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

### models selection ###
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

### senitment analysis ###
from textblob import TextBlob 
from textblob.sentiments import NaiveBayesAnalyzer 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Clustering ####
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

from mpl_toolkits.mplot3d import Axes3D
from xgboost import plot_importance
In [7]:
############################################################################# Full Read #####################################################################################################
### will use for latter analysis ####

reviews= pd.read_csv("Amazon_reviews.csv", names=["rating", "title", "review"],)
In [2]:
#################################################################################### partial sample Read ####################################################################################

n = 3000000 # df size
s = 10000 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

Sentiment Analysis:

  • TextBlob
  • Vader
  • Graphical Analysis
In [4]:
### text blob
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
for i, review in enumerate(reviews.review):
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))

Text_blob_polarity text_blob_subjectivity
0 0.000000 0.000000
1 -0.021429 0.576190
2 0.277273 0.511364
3 0.253030 0.492424
4 0.284762 0.508333
... ... ...
9995 0.500000 0.400000
9996 0.068750 0.456250
9997 0.234091 0.437500
9998 0.136364 0.454545
9999 0.296667 0.626667

10000 rows × 2 columns

In [5]:
### vader analysis #####

analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])

for i, review in enumerate(reviews.review):
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))

0 0.111 0.741 0.148 0.2003
1 0.106 0.853 0.041 -0.3855
2 0 0.863 0.137 0.9272
3 0.071 0.858 0.071 0.0249
4 0.058 0.746 0.197 0.92
... ... ... ... ...
9995 0.195 0.805 0 -0.3867
9996 0.064 0.837 0.1 0.1747
9997 0 0.78 0.22 0.875
9998 0.097 0.903 0 -0.575
9999 0 0.567 0.433 0.8655

10000 rows × 4 columns

In [14]:
### saving sentiment #####
master=pd.concat([sent_vader, sent_blob], axis=1)
vader_neg vader_neu vader_pos vader_compound Text_blob_polarity text_blob_subjectivity
0 0.111 0.741 0.148 0.2003 0.000000 0.000000
1 0.106 0.853 0.041 -0.3855 -0.021429 0.576190
2 0 0.863 0.137 0.9272 0.277273 0.511364
3 0.071 0.858 0.071 0.0249 0.253030 0.492424
4 0.058 0.746 0.197 0.92 0.284762 0.508333
... ... ... ... ... ... ...
9995 0.195 0.805 0 -0.3867 0.500000 0.400000
9996 0.064 0.837 0.1 0.1747 0.068750 0.456250
9997 0 0.78 0.22 0.875 0.234091 0.437500
9998 0.097 0.903 0 -0.575 0.136364 0.454545
9999 0 0.567 0.433 0.8655 0.296667 0.626667

10000 rows × 6 columns

In [278]:
####################################################### graphs for sentiment ###############################################################
ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
grp_sent=sent_master.groupby('rating', as_index=False).mean()

# grp_sent
# sent_master

# sns.barplot(y=grp_sent.vader_neg, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.vader_pos, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.text_blob_subjectivity, x=grp_sent.ratting)
# plt.show()
fig = plt.figure(figsize=(50,50))
ax1=plt.subplot2grid((4,4), (0,0), rowspan=1, colspan=1)
ax2=plt.subplot2grid((4,4), (0,1), rowspan=1, colspan=1)
ax3=plt.subplot2grid((4,4), (1,0), rowspan=2, colspan=2)

sns.barplot(y=grp_sent.Text_blob_polarity, x=grp_sent.rating, ax=ax1)
sns.barplot(y=grp_sent.vader_compound, x=grp_sent.rating, ax=ax2)
sns.scatterplot(y=sent_master.Text_blob_polarity, x= sent_master.vader_compound, hue=sent_master.rating.astype(str)+'#',

ax3.set_title("\n\nText Blob Polairty Vs. Vader Compund Score", weight='bold',size=50)
ax3.set_xlabel("Vader Compound Score",size=35, weight='bold')
ax3.set_ylabel("TextBlob Polairty Score",size=35, weight='bold')
ax3.legend(loc=2, markerscale=6,fancybox=True, shadow=True,title='City Area', prop={'size': 30})

ax1.set_title("Text Blob Polairty Score", weight='bold',size=40)
ax1.set_ylabel("Text Blob Polarity",size=25, weight='bold')
ax1.set_xlabel("Star Rating", size=25, weight='bold')

ax2.set_title("Vader Polairty Score", weight='bold',size=40)
ax2.set_ylabel("Vader Compound Score",size=25, weight='bold')
ax2.set_xlabel("Star Rating",size=25, weight='bold')

fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\sentiment.png')

# sent_master
Creating Document Term Matrices

  • Document Term Frequency Matrix
  • Weighted Document Term Matrix (L1, L2)
In [63]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from nltk import wordpunct_tokenize

### Stop Words ####
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Additions to the stop word list
en_stop.extend([".","-","(", ")","/", r",+", "’", "”","“", "\n",r"\\)","!","?",r"\\(",r"\\'",r'\\"'])
In [11]:
############################################################################ Document Term Matrixs ###################################################################
stem_function= PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    for token in tokens:
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
$ & -- 1 1 2 10 100 12 15 2 ... wrong wrote ye year year ago year old yet youll young youv
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1000 columns

In [61]:
In [16]:
########################################################################### Term Frequancey L1 & L2 ##############################################################################
from sklearn.feature_extraction.text import TfidfTransformer

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
tfdif3 = TfidfTransformer(norm='l2', use_idf=True)

dtm_l2 = tfdif2.fit_transform(df2)
dtm_l1 = tfdif3.fit_transform(df2)

### l2 DTM #####
dtm_l2=pd.DataFrame(dtm_l2.toarray(), columns=vect.get_feature_names())
dtm_l2=pd.concat([dtm_l2, master], axis=1)
# dtm_l2.shape()
dtm_l2.to_pickle("dtm_l2.pkl") ## using pickle fromat due to size

# ### l1 DTM #####
dtm_l1=pd.DataFrame(dtm_l1.toarray(), columns=vect.get_feature_names())
dtm_l1=pd.concat([dtm_l1, master], axis=1)
dtm_l1.to_pickle("dtm_l1.pkl") ## using pickle format due to size
# dtm_l1.shape()


df2=pd.concat([df2, master], axis=1)
# df2.shape()

Train Test Split & TSNE

  • T-SNE tuning
  • Train Test split and T-SNE Fitting
  • Graphing T-SNE
In [31]:
############################################################################ T-SNE Tunning ########################################################################
### mapping TSNE ####
### aproximating TSNE with sample
# rows=random.sample(range(1, df.shape[0]), 1000)
# df=df.iloc[rows,:]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

x=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)

sns.scatterplot(x=x[:,0], y=x[:,1],hue=y.astype(str)+'_#')
array([[  4.4325747,  -5.829836 ],
       [ 13.77044  ,  -6.8514385],
       [ -6.1297827,   2.7519443],
       [-14.941263 ,   2.4553525],
       [ 15.015589 ,  -4.0810432],
       [ -8.19329  ,   5.088409 ]], dtype=float32)
In [3]:
############################################################################ train test split & TSNE Fit #################################################

## this esection creates a train and test direcotry & fits T-SNE #####
name=["dtm_l1","dtm_l2", "dtm_freq"]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

for i, file in enumerate(directory_main):
    ### reading in file #####
    ### getting TSNE #####
    sne_df=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)
    sne_df=pd.DataFrame(sne_df, columns=["TSNE_1","TSNE_1"])
    sne_df.to_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\{}".format(TSNE_file))
    print("TSNE calcuclation done for ", name[i])
    print("total time ", datetime.datetime.now()-now)
    print("Current Time: ", datetime.datetime.now())

    #### setting up directories for Train Test splits #####
    x_train, x_test, y_train, y_test = train_test_split(df.values, y, test_size=0.05, random_state=42)
TSNE calcuclation done for  dtm_l1
total time  0:12:23.792472
Current Time:  2019-12-13 00:32:03.176892
In [294]:
################################################################################ TSNE Graph ####################################################################################
# tsne=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\dtm_l1_TSNE.pkl')
# ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

# tsne['Ratings']=ratings
# tsne=tsne.sort_values('Ratings')
# tsne['Ratings']=tsne['Ratings'].astype(str)+'#'

fig, axe = plt.subplots(figsize=(20,20))

sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.3 )
sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.3)
sns.scatterplot(x=tsne.iloc[:,0], y=tsne.iloc[:,1], hue=tsne['Ratings'], palette='twilight',
                x_jitter=10,y_jitter=10,alpha=.8,style=tsne['Ratings'], ax=axe)
axe.set_title('T-SNE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('T-SNE Component One', size=30)
axe.set_xlabel('T-SNE Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\TSNE.png')
In [ ]:
### Clean Up ####
# del(df, directory_main, file, i, name,names, y, x_train, x_test, y_train, y_test)

Dimension Reduction

This section is not in the most logical order so I will explain. The first initial dimension reduction were just fit based off of default settings. After running the first batch of models and narrowing down my DTMs the dimensions were then re-worked and more finely tunned.

  • 1st dimension reductions
  • l1 dimension reductions
  • UMap Graph
  • LLE Graph
In [4]:
########################################################################### First Dimension Reductions #############################################################################

import warnings


#### dimension Reduction ####

#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=15, n_components=3)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
names=["dtm_freq", "dtm_l1", "dtm_l2"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")

for i, file in enumerate(x_train_directory):
    ### take out [0:50] to run whole dataset ####
    drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
    sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

    sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

    ### Sparse ####
    Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
    col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
    sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
    sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
    print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    ### PCA #####
    PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
    col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
    PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
    PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
    print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    ### U Map #####
    u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
    col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
    umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
    umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
    print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    ### lle ####
    lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
    col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
    lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
    lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
    print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

    ### Saving\creating DF's ###
    train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
    test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)
    print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
UMAP for DTM dtm_freq_x_train.pkl
total time : 0:02:53.363985

LLE for DTM dtm_freq_x_train.pkl
total time : 0:04:28.076542

############################# File Saved At 2019-12-13 00:51:45.254088 #################################
In [108]:
####################################################################### L1 Dimension Reduction ##############################################################################

import warnings


#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
names=[ "dtm_l1"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")[1]
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")[1]


################################ reading DataFrame ###########################################################
### take out [0:50] to run whole dataset ####
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

#################################### Sparse #################################
Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### PCA ########################################
PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### U Map #######################################
u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

#################################### lle ############################################
lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

############################## Saving\creating DF's ####################################################
train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)


print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
In [82]:
####################################################### Saving fits ####################################################
import pickle
dimension_tec=[umap_fun, lle_fun, sparce_fun, pca_fun]
dimension_name=['umap_fun.pickle', 'lle_fun.pickle', 'sparce_fun.pickle', 'PCA_fun.pickle']
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects"
for i, fit in enumerate(dimension_tec):
    pickle_out = open(dimension_name[i],"wb")
    pickle.dump(fit, pickle_out)
In [289]:
########################################################################### UMAP Graph ##############################################################################################

#### reading in files
y_train_l1=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')

#### used for tunning UMap Wich was latter applied to the finale dimensions
# umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
# u_map_transform_train=umap_fun.fit_transform(x_train.iloc[:,:-8])
# raw=u_map_transform_train
# u_map_transform_train=pd.DataFrame(u_map_transform_train, columns=['UMAP_1', 'UMAP_2'])
# u_map_transform_train['rating']=y_train_l1.rating
# u_map_transform_train=u_map_transform_train.sort_values('rating')
# u_map_transform_train.rating=u_map_transform_train.rating.astype(str)+'#'
# u_map_transform_train=u_map_transform_train[u_map_transform_train.rating!='nan#']
# ones=u_map_transform_train[u_map_transform_train.rating=='1#']
# fives=u_map_transform_train[u_map_transform_train.rating=='5#']

########################################### plotting UMAP ##############################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones and Fives Densities
# sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )
# sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### Total Densities
sns.kdeplot(data=u_map_transform_train.iloc[:,0],data2=u_map_transform_train.iloc[:,1], cmap="twilight", ax=axe, shade=True, alpha=.4 )

### individual Points
sns.scatterplot(u_map_transform_train.UMAP_1, u_map_transform_train.UMAP_2, hue=u_map_transform_train.rating ,
                style=u_map_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('UMAP Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('UMAP Component One', size=30)
axe.set_xlabel('UMAP Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

#plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\UMAP.png')
In [292]:
####################################################################### LLE Graph ##############################################################################################

# lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
# lle_transform_train=lle_fun.fit_transform(x_train.iloc[:3000,:-8]) # fitting to the training set

lle_transform_train=pd.DataFrame(lle_transform_train, columns=['LLE_1', 'LLE_2'])

##################################################### Plotting LLE ############################################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones Desnsity ###
#sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )

### Twos Desnity #####
#sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### total Density ####
sns.kdeplot(data=lle_transform_train.iloc[:,1],data2=lle_transform_train.iloc[:,0],cmap="twilight", ax=axe, shade=True, alpha=.4 )

sns.scatterplot(lle_transform_train.LLE_2, lle_transform_train.LLE_1, hue=lle_transform_train.rating ,
                style=lle_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('LLE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('LLE Component One', size=30)
axe.set_xlabel('LLE Component Two', size=30)

axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

# axe.set_xlim(-.05,.08)

plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\LLE.png')

Building the Models

  • First Param Grid
  • First Initial Large Grid Search
  • Analyzing / Graphing Top Model
  • Fitting XgBoost on new features and more parameters
  • Plotting Feature Importance
  • Single Fit Algorithum Cell
  • Y test Fit Cell
  • Data Pipe Line Test
In [5]:
######################################################################### first Paramgrid ####################################################################################



master_params=[knn_params, random_params, gradient_params, xg_params]
models=[KNeighborsClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier]
In [6]:
######################################################################### first inital GridSearch ##################################################################################

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

df_names=["freq", "l1","l2"]

##### selecting only 100 PCA's (issue with time) ##########################
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'TSNE_1','TSNE_2'])

for i, x_train in enumerate(x_train_files):
    #### reading files #######
    for j, model in enumerate(models):
        #### fitting models ####
        clf=GridSearchCV(model(), master_params[j], n_jobs= -1 ,cv=5, verbose=10)
        new_models=clf.fit(x_train, y_train)
        master_1=pd.concat([master_1,df], axis=0)
        print('Finished with df {} model {}'.format(df_names[i],model.__name__))
        print("Total time:", datetime.datetime.now()-now)
        print("\n\n#################################################### Fin #############################################################")
In [320]:
######################################################################### top Model Graphs Round One #########################################################################################

rd_one=pd.read_csv('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\master_all_cols.csv')
display(rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].max())
df_3=rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].agg(['min','max','mean','std'],as_index=False)

df_3=df_3.sort_values('max', ascending=False)
fig, ax= plt.subplots(figsize=(11,5))
sns.barplot(y=df_3['model'], x=df_3['max'], hue=df_3.df, palette='mako')
ax.set_title('Top Model by Algorithum and DTM')
axe.legend(loc=2, markerscale=5,fancybox=True, shadow=True, prop={'size': 30})
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\top_models.png')
model df mean_test_score
0 GradientBoostingClassifier freq 0.396526
1 GradientBoostingClassifier l1 0.405474
2 GradientBoostingClassifier l2 0.399895
3 KNeighborsClassifier freq 0.307579
4 KNeighborsClassifier l1 0.317263
5 KNeighborsClassifier l2 0.317368
6 RandomForestClassifier freq 0.389474
7 RandomForestClassifier l1 0.398421
8 RandomForestClassifier l2 0.393474
9 XGBClassifier freq 0.404000
10 XGBClassifier l1 0.413579
11 XGBClassifier l2 0.408316
In [132]:
###################################################################### NEW Feature Fit + Various Paremeters ########################################################################
### this cell dynamically changed as new features and parameters were test on feature importance


path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"


####### feature importance selection ################3
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:200]
# cols=["PCA_{}".format(x) for x in range(100)]
# cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
#              'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']

##################### Reading in File ############################################

#### fitting models ####
clf=GridSearchCV(XGBClassifier(tree_method='gpu_hist'), xg_params ,cv=3, verbose=10)
new_models=clf.fit(x_train, y_train)
df['cols']='to 50 ft'
master_3=pd.concat([master_2,df], axis=0)
print('Finished with df {} model {}'.format(df_names[i],model.__name__))
print("Total time:", datetime.datetime.now()-now)
print("\n\n#################################################### Fin #############################################################")
In [126]:
####################################################################### Creating Feature Importance DataFrame ################################################################################
#clf.estimator.fit(x_train, y_train)
# clf_1=XGBClassifier(**clf.best_params_)
# clf_1.fit(x_train, y_train)
# clf_1.feature_importances_
# plot_importance(clf_1,max_num_features=30 )
# x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
# x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]]
# ft_import_df=pd.DataFrame()
# ft_import_df['ft_im']=clf_1.feature_importances_
# ft_import_df['cols']=x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]].columns
# ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
# fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
# fin_100.mean_test_score.max()
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, tree_method='gpu_hist', verbosity=1)
In [332]:
####################################################################### Plotting Feature Importance  ####################################################################################
ft_import_df_1=ft_import_df.sort_values('ft_im', ascending=False)
fig, ax= plt.subplots(figsize=(20,10))
sns.barplot(ft_import_df_1.ft_im[0:16],ft_import_df_1.cols[0:16],ax=ax, palette='twilight')
ax.set_title("XGBoost's Top 16 Most Important Features\n", size=30,fontdict= {'family':'MS Reference Sans Serif','weight':'bold'}, weight='bold'  )
ax.set_xlabel('F-Score', weight='bold', size=15)
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\XG_ft_importance_fig.png')
In [8]:
fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
In [260]:
########################################################################## Single Fit Algorithum Cell #####################################################################################
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"


path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
params={'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'subsample': 0.6,
 'tree_method': 'gpu_hist'}

col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']

ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')

##### Selecting top 100 ft ##########################
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]


#### fitting models ####
for col in col_list_fix:

from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
scores = cross_validate(clf, x_train , y_train, cv=5)
In [ ]:
############################################################### Test fit For Y Variable ###################################################################

x_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test\\dtm_l1_dim_rd_test.pkl")
y_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl")
y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl")
x_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_train\\dtm_l1_dim_rd_train.pkl")
col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']

for col in col_list_fix:

y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\\y_train\\dtm_l1_y_train.pkl")

# display(x_test.head())
# x_train.head()
# clf.fit(x_train[cols].values,y_train)
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print("\n\n\n test df results")
print(classification_report(y_train, pred_1))

print("\n\n\n test df results")
print(classification_report(y_test, pred))
In [166]:
####################################################### saving model ####################################################
import pickle

pickle_out = open("clf.pickle","wb")
pickle.dump(clf, pickle_out)
In [ ]:
##################################################################### Creating a Data Pipline to Test #################################################################################

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
from sklearn.feature_extraction.text import TfidfTransformer

n = 3000000 # df size
s = 800 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews_test = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])

###### vader
for i, review in enumerate(reviews_test.review):
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))

### text blob
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
for i, review in enumerate(reviews_test.review):
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
master=pd.concat([sent_vader, sent_blob], axis=1)

stem_function= PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    for token in tokens:
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
dtm_l2 = tfdif2.fit_transform(df2)
df2=pd.concat([df2, master], axis=1)

sent_test=df2[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']
dimension_tec=[umap_fun, lle_fun, sparce_fun,pca_fun]
cols=[col for col in df2.columns if col not in drops]

col_name=["UMAP_{}".format(x) for x in range(umap_ft.shape[1])] # creating col names
umap_df_train=pd.DataFrame(umap_ft, columns=col_name) # Sparse train df

col_name=["LLE_{}".format(x) for x in range(lle_ft.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_ft, columns=col_name) # Sparse train df

col_name=["Sparse_{}".format(x) for x in range(sparce_ft.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(sparce_ft, columns=col_name) # Sparse train df

col_name=["PCA_{}".format(x) for x in range(pca_ft.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(pca_ft, columns=col_name) # Sparse train df

train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,master,lle_df_train], axis=1)

chng=['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
for col in chng:

cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

# pred=cross_val_predict(clf, X, y)

print(classification_report(reviews_test.rating, pred))
##### selecting only 100 PCA's (issue with time) ##########################
# cols=["PCA_{}".format(x) for x in range(100)]
#cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             #'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']


x_train[[col for col in x_train.columns if col not in ['TSNE_1', 'TSNE_2']]]
Sparse_0 Sparse_1 Sparse_2 PCA_0 PCA_1 PCA_2 PCA_3 PCA_4 PCA_5 PCA_6 ... UMAP_0 UMAP_1 vader_neg vader_neu vader_pos vader_compound Text_blob_polarity text_blob_subjectivity LLE_0 LLE_1
0 -0.003447 -0.000146 -0.001338 -0.065991 0.028139 0.072930 -0.085083 -0.060750 0.044697 0.038091 ... -4.084264 6.528867 0.094 0.906 0 -0.6486 -0.0642857 0.52619 0.007993 -0.005878
1 0.000000 0.000000 0.000000 -0.071607 -0.082483 -0.001451 0.001518 -0.027734 0.044720 0.046058 ... -1.139816 2.391321 0 0.826 0.174 0.4939 -0.00416667 0.178571 -0.002267 -0.003622
2 -0.004382 0.000167 -0.015122 -0.053353 0.035840 0.105020 -0.068836 -0.044808 0.004433 -0.011878 ... -4.211996 5.050555 0.113 0.704 0.182 0.6674 -0.019725 0.657407 0.002207 -0.002527
3 -0.008564 0.000079 0.000000 -0.087841 -0.113705 -0.019849 -0.004054 -0.066870 -0.046401 -0.023208 ... -1.442914 1.960394 0.082 0.779 0.139 0.7287 0.206667 0.555 0.002733 0.005821
4 -0.000158 0.000006 -0.022199 -0.048134 0.052211 0.162864 -0.100061 -0.066620 0.021119 0.039493 ... -2.902161 5.783535 0.018 0.83 0.152 0.8338 0.151786 0.642857 0.011837 -0.003232
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9495 -0.011959 -0.026144 0.000000 0.310136 0.018771 -0.015299 -0.018336 -0.029080 -0.018617 0.035390 ... -5.325161 4.488866 0 0.764 0.236 0.9779 0.323214 0.514286 0.001513 0.001566
9496 -0.007915 -0.013018 0.000000 0.128538 -0.011163 0.011558 -0.024980 -0.019052 0.040633 0.002340 ... -5.991697 3.430374 0.118 0.782 0.099 -0.4734 0.300278 0.63381 0.002872 0.002746
9497 -0.011722 0.000165 0.000000 -0.094820 0.130341 -0.144374 -0.007765 0.021522 -0.123656 0.041907 ... -3.833734 -0.690987 0.239 0.691 0.069 -0.946 -0.328571 0.614286 0.001560 0.002668
9498 -0.003490 -0.020510 0.000000 0.221674 -0.010095 -0.064216 -0.096943 -0.049029 0.029237 0.077646 ... -5.227555 3.110168 0.041 0.83 0.129 0.5859 0.266667 0.55 -0.000388 0.000630
9499 -0.003749 -0.009046 0.000000 0.084163 -0.015090 0.007058 -0.025886 -0.013741 0.067081 0.047346 ... -5.806878 5.106863 0.051 0.648 0.301 0.9185 -0.0745833 0.473333 -0.000098 0.000991

9500 rows × 572 columns

In [40]:
### feature engineering: most Frequent Words ####
# x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')
# y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# x_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test\\dtm_l1_x_test.pkl')
# y_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl')
# ###### Most Frequent Words #########

# top=500
# df=x_train
# df['rating']=y_train.rating
# # l1_dtm['rating']=rev.rating
# # display(l1_dtm.melt(var_name='rating'))
# top_1=df.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
# top_2=df.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
# top_3=df.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
# top_4=df.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
# top_5=df.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

# one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3]
# two_toks=[col for col in top_2 if col not in top_5 | top_4 ]
# three_toks=[col for col in top_3 if col not in top_5 | top_1]
# four_toks=[col for col in top_4 if col not in top_2|top_1]
# five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]

# x=['familiar' for x in range(len(one_toks))]
# g=0
# freq_map={**{i:'Low star' for i in one_toks},**{i:'Med star' for i in three_toks},**{i:'High star' for i in five_toks}}

for dex in df.T.iloc[:-9,0].index:
Index(['$', '&', '--', '1', '1 2', '10', '100', '12', '15', '2',
       'wrong', 'wrote', 'ye', 'year', 'year ago', 'year old', 'yet', 'youll',
       'young', 'youv'],
      dtype='object', length=1000)
In [556]:
###### feature Engineering: Binning polarity
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test_2\\dtm_l1_dim_rd_test_2.pkl')
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')

0      0.7906
1      0.4926
2      0.1265
3      0.1263
4      0.9531
495    0.8233
496    0.9351
497    0.4215
498   -0.6537
499    0.7003
Name: vader_compound, Length: 500, dtype: object

Clustering Process

  • K-Means
  • Most Frequenct Words
  • Token Cluster Attempt
In [25]:
In [25]:
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

### reducing cols ####
cols=["PCA_{}".format(x) for x in range(100)]

### getting file directoryies 

hdbscan_fun=HDBSCAN(min_cluster_size=500, min_samples=50, core_dist_n_jobs=-1,metric='manhattan')
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# cols=["PCA_{}".format(x) for x in range(100)]

cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']


cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# cluster_df.to_pickle(names[i]+'_cluster.pkl')
# print("############### DF {} at {}##########".format(file,now))
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters
In [22]:
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

In [26]:
rating 1.0 2.0 3.0 4.0 5.0 nan
-1 493 499 482 481 455 148
0 187 167 164 167 170 51
1 1153 1174 1151 1118 1152 288
In [110]:
######################################################################### HDBSCAN Crosstab #######################################################################################

pd.crosstab(cluster_df['rating'],cluster_df['HDBSCAN'],margins=True, margins_name='total')
HDBSCAN -1 0 1 2 3 4 5 6 7 total
1 1343 29 130 73 2 1 183 60 12 1833
2 1408 31 115 57 4 1 154 61 9 1840
3 1350 39 109 54 3 3 177 53 9 1797
4 1325 34 114 57 1 4 169 55 7 1766
5 1359 29 115 52 1 1 157 53 10 1777
total 6785 162 583 293 11 10 840 282 47 9013
In [31]:
#################################################################################### DBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
db_fun= DBSCAN(eps=.5,min_samples=500,p=2, n_jobs=-1,leaf_size=100)
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
# cols=['TSNE_1','TSNE_2']
# cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']


cluster_df=pd.DataFrame(cluster, columns=['DBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))
In [32]:
################################################################ DBSCAN Cross Tab ########################################################################################
pd.crosstab(cluster_df['rating'],cluster_df['DBSCAN'],margins=True, margins_name='total')
DBSCAN -1 0 1 total
1 493 187 1153 1833
2 499 167 1174 1840
3 482 164 1151 1797
4 481 167 1118 1766
5 455 170 1152 1777
total 2410 855 5748 9013
In [29]:
#################################################################################### K-MEANS ################################################################################################
km_fun= KMeans(n_clusters=5, n_jobs=-1)
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
#cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
#cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']


cluster_df=pd.DataFrame(cluster, columns=['K_MEANS'])  
# print("############### DF {} at {}##########".format(file,now))
In [30]:
######################################################################### K-Means Cross Tab ##################################################################################
pd.crosstab(cluster_df['rating'],cluster_df['K_MEANS'],margins=True, margins_name='total')
K_MEANS 0 1 2 3 4 total
1 296 286 591 367 293 1833
2 329 289 606 368 248 1840
3 341 281 567 364 244 1797
4 313 272 572 351 258 1766
5 297 271 619 343 247 1777
total 1576 1399 2955 1793 1290 9013
In [333]:
################# Clustering on tokens #################
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")
In [487]:
In [487]:
####################################################################### Frequency of word by Ratings ##################################################

# l1_dtm=l1_dtm.iloc[:,:-6]
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")

# display(l1_dtm.melt(var_name='rating'))
top_1=l1_dtm.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
top_2=l1_dtm.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
top_3=l1_dtm.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
top_4=l1_dtm.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
top_5=l1_dtm.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3 | top_2 ]
two_toks=[col for col in top_2 if col not in top_5 | top_4 |]
three_toks=[col for col in top_3 if col not in top_5 | top_1]
four_toks=[col for col in top_4 if col not in top_2|top_1]
five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]
In [410]:
################################################################################# Clustering Token Atempt  ########################################################################################


umap_fun=umap.UMAP(n_neighbors=50, n_components=10, metric='cosine')



cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))
In [427]:
token_cluster=pd.concat([cluster_df,tokens,trn.rating], axis=1)
HDBSCAN 0 1 2 3 4
1 0 0 1000 0 0
2 0 0 0 1000 0
3 0 1000 0 0 0
4 999 0 0 1 0
5 0 0 0 0 1000