A Predictive Review

by: Joshua Roberge

In [2]:
### dependencies ###

import pandas as pd
import numpy as np
import random 
import re

### tokenizers #####
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import wordpunct_tokenize

### DTM tehcniques ###
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import wordpunct_tokenize
from nltk import word_tokenize 


## DTM Makers ####
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



#### dimension reduction techniques ######
### dimension reduction teckniques
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.decomposition import SparsePCA
import umap
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

import os
import datetime 



#### models implenmentations ######
## models 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

### models selection ###
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt


### senitment analysis ###
from textblob import TextBlob 
from textblob.sentiments import NaiveBayesAnalyzer 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


### Clustering ####
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN


from mpl_toolkits.mplot3d import Axes3D
from xgboost import plot_importance
In [7]:
############################################################################# Full Read #####################################################################################################
### will use for latter analysis ####

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
reviews= pd.read_csv("Amazon_reviews.csv", names=["rating", "title", "review"],)
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews
In [2]:
#################################################################################### partial sample Read ####################################################################################

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"

n = 3000000 # df size
s = 10000 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

del(skip,n)
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews

Sentiment Analysis:

  • TextBlob
  • Vader
  • Graphical Analysis
In [4]:
### text blob
TextBlob(reviews.review[1]).sentiment
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        n+=1000



sent_blob
    
    
0% done at 2019-12-12 20:18:40.382848
0% done at 2019-12-12 20:18:43.433088
0% done at 2019-12-12 20:18:46.498303
0% done at 2019-12-12 20:18:52.927312
Out[4]:
Text_blob_polarity text_blob_subjectivity
0 0.000000 0.000000
1 -0.021429 0.576190
2 0.277273 0.511364
3 0.253030 0.492424
4 0.284762 0.508333
... ... ...
9995 0.500000 0.400000
9996 0.068750 0.456250
9997 0.234091 0.437500
9998 0.136364 0.454545
9999 0.296667 0.626667

10000 rows × 2 columns

In [5]:
### vader analysis #####

now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0

for i, review in enumerate(reviews.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10


sent_vader
0% done at 2019-12-12 20:19:44.370207
10% done at 2019-12-12 20:19:47.443441
20% done at 2019-12-12 20:19:51.294726
30% done at 2019-12-12 20:19:55.503255
40% done at 2019-12-12 20:19:59.674867
50% done at 2019-12-12 20:20:04.239209
60% done at 2019-12-12 20:20:09.377592
70% done at 2019-12-12 20:20:15.599099
80% done at 2019-12-12 20:20:21.895568
90% done at 2019-12-12 20:20:29.317125
Out[5]:
vader_neg vader_neu vader_pos vader_compound
0 0.111 0.741 0.148 0.2003
1 0.106 0.853 0.041 -0.3855
2 0 0.863 0.137 0.9272
3 0.071 0.858 0.071 0.0249
4 0.058 0.746 0.197 0.92
... ... ... ... ...
9995 0.195 0.805 0 -0.3867
9996 0.064 0.837 0.1 0.1747
9997 0 0.78 0.22 0.875
9998 0.097 0.903 0 -0.575
9999 0 0.567 0.433 0.8655

10000 rows × 4 columns

In [14]:
### saving sentiment #####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_vader.to_pickle('sent_vader.pkl')
sent_blob.to_pickle('sent_blob.pkl')

master=pd.concat([sent_vader, sent_blob], axis=1)
master.to_pickle('master_sent.pkl')
master
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment
Out[14]:
vader_neg vader_neu vader_pos vader_compound Text_blob_polarity text_blob_subjectivity
0 0.111 0.741 0.148 0.2003 0.000000 0.000000
1 0.106 0.853 0.041 -0.3855 -0.021429 0.576190
2 0 0.863 0.137 0.9272 0.277273 0.511364
3 0.071 0.858 0.071 0.0249 0.253030 0.492424
4 0.058 0.746 0.197 0.92 0.284762 0.508333
... ... ... ... ... ... ...
9995 0.195 0.805 0 -0.3867 0.500000 0.400000
9996 0.064 0.837 0.1 0.1747 0.068750 0.456250
9997 0 0.78 0.22 0.875 0.234091 0.437500
9998 0.097 0.903 0 -0.575 0.136364 0.454545
9999 0 0.567 0.433 0.8655 0.296667 0.626667

10000 rows × 6 columns

In [278]:
####################################################### graphs for sentiment ###############################################################
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment"
sent_master=pd.read_pickle('master_sent.pkl')
sent_master
ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
sent_master['rating']=ratings
sent_master.vader_neg=sent_master.vader_neg.astype(float)
sent_master.vader_neu=sent_master.vader_neu.astype(float)
sent_master.vader_pos=sent_master.vader_pos.astype(float)
sent_master.vader_compound=sent_master.vader_compound.astype(float)
sent_master=sent_master.sort_values('rating')
grp_sent=sent_master.groupby('rating', as_index=False).mean()

# grp_sent
# sent_master

# sns.barplot(y=grp_sent.vader_neg, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.vader_pos, x=grp_sent.ratting)
# plt.show()

# sns.barplot(y=grp_sent.text_blob_subjectivity, x=grp_sent.ratting)
# plt.show()
fig = plt.figure(figsize=(50,50))
plt.style.use(['fivethirtyeight'])
ax1=plt.subplot2grid((4,4), (0,0), rowspan=1, colspan=1)
ax2=plt.subplot2grid((4,4), (0,1), rowspan=1, colspan=1)
ax3=plt.subplot2grid((4,4), (1,0), rowspan=2, colspan=2)


sns.barplot(y=grp_sent.Text_blob_polarity, x=grp_sent.rating, ax=ax1)
sns.barplot(y=grp_sent.vader_compound, x=grp_sent.rating, ax=ax2)
x=sent_master.rating.astype(str)
sns.scatterplot(y=sent_master.Text_blob_polarity, x= sent_master.vader_compound, hue=sent_master.rating.astype(str)+'#',
                palette='twilight',ax=ax3)

ax3.set_title("\n\nText Blob Polairty Vs. Vader Compund Score", weight='bold',size=50)
ax3.set_xlabel("Vader Compound Score",size=35, weight='bold')
ax3.set_ylabel("TextBlob Polairty Score",size=35, weight='bold')
ax3.legend(loc=2, markerscale=6,fancybox=True, shadow=True,title='City Area', prop={'size': 30})

ax1.set_title("Text Blob Polairty Score", weight='bold',size=40)
ax1.set_ylabel("Text Blob Polarity",size=25, weight='bold')
ax1.set_xlabel("Star Rating", size=25, weight='bold')

ax2.set_title("Vader Polairty Score", weight='bold',size=40)
ax2.set_ylabel("Vader Compound Score",size=25, weight='bold')
ax2.set_xlabel("Star Rating",size=25, weight='bold')

plt.show()
fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\sentiment.png')




# sent_master
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\sentiment

Creating Document Term Matrices

  • Document Term Frequency Matrix
  • Weighted Document Term Matrix (L1, L2)
In [63]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from nltk import wordpunct_tokenize



### Stop Words ####
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Additions to the stop word list
en_stop.extend([".","-","(", ")","/", r",+", "’", "”","“", "\n",r"\\)","!","?",r"\\(",r"\\'",r'\\"'])
In [11]:
############################################################################ Document Term Matrixs ###################################################################
stem_function= PorterStemmer
port=PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=port().stem(token)
        stems.append(stem)
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
    
    
    
word_list=reviews.review.to_list()   
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
df2.head()
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['+', '\\\\', 'abov', 'ani', 'arent', 'becaus', 'befor', 'couldnt', 'didnt', 'doe', 'doesnt', 'dont', 'dure', 'ha', 'hadnt', 'hasnt', 'havent', 'hi', 'isnt', 'mightnt', 'mustnt', 'neednt', 'onc', 'onli', 'ourselv', 'shant', 'shouldnt', 'shouldv', 'thatll', 'themselv', 'thi', 'veri', 'wa', 'wasnt', 'werent', 'whi', 'wont', 'wouldnt', 'youd', 'youll', 'yourselv', 'youv'] not in stop_words.
  'stop_words.' % sorted(inconsistent))
Out[11]:
$ & -- 1 1 2 10 100 12 15 2 ... wrong wrote ye year year ago year old yet youll young youv
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1000 columns

In [61]:
##################################################################### Environment Cleanup ###################################################################################################
#del(en_stop,string, text, word_list,dtm)

### saving DF2
#df2.to_pickle("freq_df.pkl")


#  del(en_stop,string, text, word_list)
In [16]:
########################################################################### Term Frequancey L1 & L2 ##############################################################################
from sklearn.feature_extraction.text import TfidfTransformer

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
tfdif3 = TfidfTransformer(norm='l2', use_idf=True)


dtm_l2 = tfdif2.fit_transform(df2)
dtm_l1 = tfdif3.fit_transform(df2)


%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"

### l2 DTM #####
dtm_l2=pd.DataFrame(dtm_l2.toarray(), columns=vect.get_feature_names())
dtm_l2=pd.concat([dtm_l2, master], axis=1)
# dtm_l2.shape()
dtm_l2.to_pickle("dtm_l2.pkl") ## using pickle fromat due to size
del(dtm_l2) 

# ### l1 DTM #####
dtm_l1=pd.DataFrame(dtm_l1.toarray(), columns=vect.get_feature_names())
dtm_l1=pd.concat([dtm_l1, master], axis=1)
dtm_l1.to_pickle("dtm_l1.pkl") ## using pickle format due to size
# dtm_l1.shape()

del(dtm_l1)

df2=pd.concat([df2, master], axis=1)
df2.to_pickle("freq_df.pkl")
# df2.shape()

reviews.to_pickle("reviews.pkl")
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's

Train Test Split & TSNE

  • T-SNE tuning
  • Train Test split and T-SNE Fitting
  • Graphing T-SNE
In [31]:
############################################################################ T-SNE Tunning ########################################################################
### mapping TSNE ####
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
df=pd.read_pickle(directory_main[0])

### aproximating TSNE with sample
# rows=random.sample(range(1, df.shape[0]), 1000)
# df=df.iloc[rows,:]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating
y[rows]

x=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)

sns.scatterplot(x=x[:,0], y=x[:,1],hue=y.astype(str)+'_#')
Out[31]:
array([[  4.4325747,  -5.829836 ],
       [ 13.77044  ,  -6.8514385],
       [ -6.1297827,   2.7519443],
       ...,
       [-14.941263 ,   2.4553525],
       [ 15.015589 ,  -4.0810432],
       [ -8.19329  ,   5.088409 ]], dtype=float32)
In [3]:
############################################################################ train test split & TSNE Fit #################################################

## this esection creates a train and test direcotry & fits T-SNE #####
now=datetime.datetime.now()
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
directory_main=os.listdir()[0:3]
name=["dtm_l1","dtm_l2", "dtm_freq"]
y=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

for i, file in enumerate(directory_main):
    
    ### reading in file #####
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's"
    df=pd.read_pickle(file)
    
    
    ### getting TSNE #####
    sne_df=TSNE(n_components=2, perplexity=250, verbose=10).fit_transform(df.values)
    sne_df=pd.DataFrame(sne_df, columns=["TSNE_1","TSNE_1"])
    TSNE_file=name[i]+"_TSNE"+".pkl"
    sne_df.to_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\{}".format(TSNE_file))
    df['TSNE_1']=sne_df.iloc[:,0]
    df['TSNE_2']=sne_df.iloc[:,1]
    print("TSNE calcuclation done for ", name[i])
    print("total time ", datetime.datetime.now()-now)
    print("Current Time: ", datetime.datetime.now())

    #### setting up directories for Train Test splits #####
    x_train, x_test, y_train, y_test = train_test_split(df.values, y, test_size=0.05, random_state=42)
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train"
    pd.DataFrame(x_train, columns=df.columns).to_pickle(name[i]+"_x_train"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test"
    pd.DataFrame(x_test, columns=df.columns).to_pickle(name[i]+"_x_test"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train"
    pd.DataFrame(y_train).to_pickle(name[i]+"_y_train"+".pkl")
    
    %cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test"
    pd.DataFrame(y_test).to_pickle(name[i]+"_y_test"+".pkl")

    
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.454s...
[t-SNE] Computed neighbors for 10000 samples in 149.610s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.318999
[t-SNE] Computed conditional probabilities in 3.087s
[t-SNE] Iteration 50: error = 69.9212418, gradient norm = 0.0189482 (50 iterations in 23.324s)
[t-SNE] Iteration 100: error = 71.1002502, gradient norm = 0.0061875 (50 iterations in 17.171s)
[t-SNE] Iteration 150: error = 71.9981155, gradient norm = 0.0022648 (50 iterations in 18.855s)
[t-SNE] Iteration 200: error = 71.8713760, gradient norm = 0.0040598 (50 iterations in 26.203s)
[t-SNE] Iteration 250: error = 71.8986206, gradient norm = 0.0035788 (50 iterations in 30.483s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.898621
[t-SNE] Iteration 300: error = 2.7940745, gradient norm = 0.0019683 (50 iterations in 46.084s)
[t-SNE] Iteration 350: error = 2.5963926, gradient norm = 0.0015467 (50 iterations in 76.462s)
[t-SNE] Iteration 400: error = 2.4361203, gradient norm = 0.0002727 (50 iterations in 74.450s)
[t-SNE] Iteration 450: error = 2.4035795, gradient norm = 0.0001801 (50 iterations in 51.061s)
[t-SNE] Iteration 500: error = 2.3779471, gradient norm = 0.0001856 (50 iterations in 30.629s)
[t-SNE] Iteration 550: error = 2.3771119, gradient norm = 0.0001429 (50 iterations in 36.838s)
[t-SNE] Iteration 600: error = 2.3640788, gradient norm = 0.0001627 (50 iterations in 34.245s)
[t-SNE] Iteration 650: error = 2.3159752, gradient norm = 0.0004437 (50 iterations in 26.964s)
[t-SNE] Iteration 700: error = 2.1884377, gradient norm = 0.0013692 (50 iterations in 14.183s)
[t-SNE] Iteration 750: error = 2.1858022, gradient norm = 0.0000592 (50 iterations in 13.892s)
[t-SNE] Iteration 800: error = 2.1857400, gradient norm = 0.0000254 (50 iterations in 13.840s)
[t-SNE] Iteration 850: error = 2.1857526, gradient norm = 0.0000142 (50 iterations in 13.721s)
[t-SNE] Iteration 900: error = 2.1858263, gradient norm = 0.0000185 (50 iterations in 13.658s)
[t-SNE] Iteration 950: error = 2.1859376, gradient norm = 0.0000097 (50 iterations in 13.788s)
[t-SNE] Iteration 1000: error = 2.1861193, gradient norm = 0.0000105 (50 iterations in 13.879s)
[t-SNE] KL divergence after 1000 iterations: 2.186119
TSNE calcuclation done for  dtm_l1
total time  0:12:23.792472
Current Time:  2019-12-13 00:32:03.176892
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.447s...
[t-SNE] Computed neighbors for 10000 samples in 81.599s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.109157
[t-SNE] Computed conditional probabilities in 3.547s
[t-SNE] Iteration 50: error = 67.7459335, gradient norm = 0.0055018 (50 iterations in 15.446s)
[t-SNE] Iteration 100: error = 60.2961349, gradient norm = 0.0007880 (50 iterations in 15.800s)
[t-SNE] Iteration 150: error = 60.1451607, gradient norm = 0.0000250 (50 iterations in 13.398s)
[t-SNE] Iteration 200: error = 60.1399765, gradient norm = 0.0000413 (50 iterations in 12.312s)
[t-SNE] Iteration 250: error = 60.1398468, gradient norm = 0.0000249 (50 iterations in 12.501s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.139847
[t-SNE] Iteration 300: error = 1.6233674, gradient norm = 0.0019057 (50 iterations in 13.850s)
[t-SNE] Iteration 350: error = 1.3570467, gradient norm = 0.0003720 (50 iterations in 15.129s)
[t-SNE] Iteration 400: error = 1.2828726, gradient norm = 0.0001797 (50 iterations in 16.288s)
[t-SNE] Iteration 450: error = 1.2504454, gradient norm = 0.0001028 (50 iterations in 16.358s)
[t-SNE] Iteration 500: error = 1.2338892, gradient norm = 0.0000704 (50 iterations in 15.778s)
[t-SNE] Iteration 550: error = 1.2245333, gradient norm = 0.0000457 (50 iterations in 15.971s)
[t-SNE] Iteration 600: error = 1.2187227, gradient norm = 0.0000328 (50 iterations in 15.571s)
[t-SNE] Iteration 650: error = 1.2149807, gradient norm = 0.0000259 (50 iterations in 16.413s)
[t-SNE] Iteration 700: error = 1.2124690, gradient norm = 0.0000203 (50 iterations in 16.469s)
[t-SNE] Iteration 750: error = 1.2106905, gradient norm = 0.0000205 (50 iterations in 16.630s)
[t-SNE] Iteration 800: error = 1.2095976, gradient norm = 0.0000152 (50 iterations in 15.488s)
[t-SNE] Iteration 850: error = 1.2088320, gradient norm = 0.0000142 (50 iterations in 15.448s)
[t-SNE] Iteration 900: error = 1.2083148, gradient norm = 0.0000136 (50 iterations in 16.687s)
[t-SNE] Iteration 950: error = 1.2080100, gradient norm = 0.0000120 (50 iterations in 15.635s)
[t-SNE] Iteration 1000: error = 1.2077711, gradient norm = 0.0000104 (50 iterations in 15.261s)
[t-SNE] KL divergence after 1000 iterations: 1.207771
TSNE calcuclation done for  dtm_l2
total time  0:18:59.970836
Current Time:  2019-12-13 00:38:39.354255
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\df's
[t-SNE] Computing 751 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.727s...
[t-SNE] Computed neighbors for 10000 samples in 164.801s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.473225
[t-SNE] Computed conditional probabilities in 2.914s
[t-SNE] Iteration 50: error = 68.2438889, gradient norm = 0.1391273 (50 iterations in 20.014s)
[t-SNE] Iteration 100: error = 69.6944733, gradient norm = 0.0865247 (50 iterations in 20.759s)
[t-SNE] Iteration 150: error = 69.5777817, gradient norm = 0.1210581 (50 iterations in 22.471s)
[t-SNE] Iteration 200: error = 69.5208130, gradient norm = 0.1019059 (50 iterations in 20.512s)
[t-SNE] Iteration 250: error = 69.9595261, gradient norm = 0.0926369 (50 iterations in 17.891s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.959526
[t-SNE] Iteration 300: error = 2.6270587, gradient norm = 0.0041577 (50 iterations in 16.372s)
[t-SNE] Iteration 350: error = 2.4387221, gradient norm = 0.0055062 (50 iterations in 15.815s)
[t-SNE] Iteration 400: error = 2.4200284, gradient norm = 0.0002775 (50 iterations in 15.821s)
[t-SNE] Iteration 450: error = 2.4162779, gradient norm = 0.0000789 (50 iterations in 15.602s)
[t-SNE] Iteration 500: error = 2.4151018, gradient norm = 0.0000439 (50 iterations in 15.792s)
[t-SNE] Iteration 550: error = 2.4143989, gradient norm = 0.0000415 (50 iterations in 16.028s)
[t-SNE] Iteration 600: error = 2.4138396, gradient norm = 0.0000240 (50 iterations in 15.866s)
[t-SNE] Iteration 650: error = 2.4133272, gradient norm = 0.0000871 (50 iterations in 15.958s)
[t-SNE] Iteration 700: error = 2.4129448, gradient norm = 0.0000293 (50 iterations in 15.966s)
[t-SNE] Iteration 750: error = 2.4127402, gradient norm = 0.0000227 (50 iterations in 15.986s)
[t-SNE] Iteration 800: error = 2.4125929, gradient norm = 0.0000113 (50 iterations in 16.272s)
[t-SNE] Iteration 850: error = 2.4125130, gradient norm = 0.0000088 (50 iterations in 16.384s)
[t-SNE] Iteration 900: error = 2.4124386, gradient norm = 0.0000104 (50 iterations in 16.308s)
[t-SNE] Iteration 950: error = 2.4124711, gradient norm = 0.0000077 (50 iterations in 16.290s)
[t-SNE] Iteration 1000: error = 2.4125392, gradient norm = 0.0000076 (50 iterations in 16.147s)
[t-SNE] KL divergence after 1000 iterations: 2.412539
TSNE calcuclation done for  dtm_freq
total time  0:27:35.701970
Current Time:  2019-12-13 00:47:15.085389
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\x_test
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_train
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\y_test
In [294]:
################################################################################ TSNE Graph ####################################################################################
# tsne=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\TSNE\\dtm_l1_TSNE.pkl')
# ratings=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl").rating

# tsne['Ratings']=ratings
# tsne=tsne.sort_values('Ratings')
# tsne['Ratings']=tsne['Ratings'].astype(str)+'#'
ones=tsne[tsne.Ratings=='1#']
fives=tsne[tsne.Ratings=='5#']

fig, axe = plt.subplots(figsize=(20,20))


sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.3 )
sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.3)
sns.scatterplot(x=tsne.iloc[:,0], y=tsne.iloc[:,1], hue=tsne['Ratings'], palette='twilight',
                x_jitter=10,y_jitter=10,alpha=.8,style=tsne['Ratings'], ax=axe)
axe.set_title('T-SNE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('T-SNE Component One', size=30)
axe.set_xlabel('T-SNE Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

fig.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\TSNE.png')
In [ ]:
### Clean Up ####
# del(df, directory_main, file, i, name,names, y, x_train, x_test, y_train, y_test)

Dimension Reduction

This section is not in the most logical order so I will explain. The first initial dimension reduction were just fit based off of default settings. After running the first batch of models and narrowing down my DTMs the dimensions were then re-worked and more finely tunned.

  • 1st dimension reductions
  • l1 dimension reductions
  • UMap Graph
  • LLE Graph
In [4]:
########################################################################### First Dimension Reductions #############################################################################

import warnings
warnings.simplefilter("ignore")

now=datetime.datetime.now()

#### dimension Reduction ####
dimensions=3
n_neighbores=15

#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=15, n_components=3)
lle_fun=LLE(n_neighbors=5,n_components=3,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)
pca_fun=PCA(n_components=.8)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=["dtm_freq", "dtm_l1", "dtm_l2"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")


for i, file in enumerate(x_train_directory):
    
    
    ### take out [0:50] to run whole dataset ####
    drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
    x_train=pd.read_pickle(path+"x_train\\"+file)
    sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_train=x_train[['TSNE_1','TSNE_2']]
    x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

    x_test=pd.read_pickle(path+"x_test\\"+x_test_directory[i])
    sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
    TSNE_test=x_test[['TSNE_1','TSNE_2']]
    x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

    ### Sparse ####
    Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
    col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
    sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
    sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
    print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### PCA #####
    PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
    col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
    PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
    PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
    print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### U Map #####
    u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
    col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
    umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
    umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
    print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   
    
    ### lle ####
    lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
    col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
    lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
    lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
    print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   


    ### Saving\creating DF's ###
    train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
    test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)
    
    test_df.to_pickle(path+"dimension_rd_x_test\\"+names[i]+"_dim_rd_test.pkl")
    train_df.to_pickle(path+"dimension_rd_x_train\\"+names[i]+"_dim_rd_train.pkl")
    
    print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)
Sparse for DTM dtm_freq_x_train.pkl
total time : 0:00:50.817578



PCA for DTM dtm_freq_x_train.pkl
total time : 0:00:54.807404
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))


UMAP for DTM dtm_freq_x_train.pkl
total time : 0:02:53.363985

LLE for DTM dtm_freq_x_train.pkl
total time : 0:04:28.076542

############################# File Saved At 2019-12-13 00:51:45.254088 #################################
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)
Sparse for DTM dtm_l1_x_train.pkl
total time : 0:04:36.999753



PCA for DTM dtm_l1_x_train.pkl
total time : 0:04:41.105059
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))


UMAP for DTM dtm_l1_x_train.pkl
total time : 0:06:07.503040

LLE for DTM dtm_l1_x_train.pkl
total time : 0:07:21.703124

############################# File Saved At 2019-12-13 00:54:38.914673 #################################
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\sklearn\decomposition\sparse_pca.py:170: DeprecationWarning: normalize_components=False is a backward-compatible setting that implements a non-standard definition of sparse PCA. This compatibility mode will be removed in 0.22.
  DeprecationWarning)
Sparse for DTM dtm_l2_x_train.pkl
total time : 0:07:29.011671



PCA for DTM dtm_l2_x_train.pkl
total time : 0:07:33.044988
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\numba\typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\..\..\Anaconda3\envs\PyhtonAndR\lib\site-packages\umap\nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))


UMAP for DTM dtm_l2_x_train.pkl
total time : 0:09:12.871087

LLE for DTM dtm_l2_x_train.pkl
total time : 0:10:19.748636

############################# File Saved At 2019-12-13 00:57:36.959185 #################################
In [108]:
####################################################################### L1 Dimension Reduction ##############################################################################

import warnings
warnings.simplefilter("ignore")

now=datetime.datetime.now()

#### Parameters ####
umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
sparce_fun=SparsePCA(n_components=3, n_jobs=-1,)
pca_fun=PCA(n_components=.8)

### initializations ####
dimension_tec=[umap_fun, lle_fun, sparce_fun]
reduction_test=pd.DataFrame()
reduction_train=pd.DataFrame()
names=[ "dtm_l1"]
### getting paths ######
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
x_train_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train")[1]
x_test_directory=os.listdir("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test")[1]

file=x_train_directory
i=0


################################ reading DataFrame ###########################################################
### take out [0:50] to run whole dataset ####
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity','TSNE_1','TSNE_2']
x_train=pd.read_pickle(path+"x_train\\"+file)
sent_train=x_train[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_train=x_train[['TSNE_1','TSNE_2']]
x_train=x_train[[col for col in x_train.columns if col not in drops]]
#     display(x_train)

x_test=pd.read_pickle(path+"x_test\\"+x_test_directory)
sent_test=x_test[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
TSNE_test=x_test[['TSNE_1','TSNE_2']]
x_test=x_test[[col for col in x_test.columns if col not in drops]]
#     display(x_test)

#################################### Sparse #################################
Sparse_transform_train=sparce_fun.fit_transform(x_train) # fitting to the training set
col_name=["Sparse_{}".format(x) for x in range(Sparse_transform_train.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(Sparse_transform_train, columns=col_name) # Sparse train df
sparse_df_test=pd.DataFrame(sparce_fun.transform(x_test), columns=col_name) # Sparse test df
print("\nSparse for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### PCA ########################################
PCA_transform_train=pca_fun.fit_transform(x_train) # fitting to the training set
col_name=["PCA_{}".format(x) for x in range(PCA_transform_train.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(PCA_transform_train, columns=col_name) # PCA train df
PCA_df_test=pd.DataFrame(pca_fun.transform(x_test), columns=col_name) # PCA test df
print("\n\n\nPCA for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

##################################### U Map #######################################
u_map_transform_train=umap_fun.fit_transform(x_train) # fitting to the training set
col_name=["UMAP_{}".format(x) for x in range(u_map_transform_train.shape[1])] # creating col names
umap_df_train=pd.DataFrame(u_map_transform_train, columns=col_name) # umap train df
umap_df_test=pd.DataFrame(umap_fun.transform(x_test), columns=col_name) # umap test df
print("\n\n\nUMAP for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   

#################################### lle ############################################
lle_transform_train=lle_fun.fit_transform(x_train) # fitting to the training set
col_name=["LLE_{}".format(x) for x in range(lle_transform_train.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_transform_train, columns=col_name) # lle train df
lle_df_test=pd.DataFrame(lle_fun.transform(x_test), columns=col_name) # lle test df
print("\nLLE for DTM {}\ntotal time :".format(file), datetime.datetime.now()-now) ## ticker   


############################## Saving\creating DF's ####################################################
train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,sent_train,lle_df_train,TSNE_train], axis=1)
test_df=pd.concat([sparse_df_test, PCA_df_test, umap_df_test, sent_test, lle_df_test,TSNE_test], axis=1)

test_df.to_pickle(path+"dimension_rd_x_test_2\\"+names[i]+"_dim_rd_test_2.pkl")
train_df.to_pickle(path+"dimension_rd_x_train_2\\"+names[i]+"_dim_rd_train_2.pkl")

print("\n############################# File Saved At {} #################################".format(datetime.datetime.now()))
Sparse for DTM dtm_l1_x_train.pkl
total time : 0:00:10.212767



PCA for DTM dtm_l1_x_train.pkl
total time : 0:00:14.086052



UMAP for DTM dtm_l1_x_train.pkl
total time : 0:08:17.256118

LLE for DTM dtm_l1_x_train.pkl
total time : 0:10:18.598185

############################# File Saved At 2019-12-15 20:52:53.456199 #################################
In [82]:
####################################################### Saving fits ####################################################
import pickle
dimension_tec=[umap_fun, lle_fun, sparce_fun, pca_fun]
dimension_name=['umap_fun.pickle', 'lle_fun.pickle', 'sparce_fun.pickle', 'PCA_fun.pickle']
%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects"
for i, fit in enumerate(dimension_tec):
    pickle_out = open(dimension_name[i],"wb")
    pickle.dump(fit, pickle_out)
    pickle_out.close()
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\models objects
In [289]:
########################################################################### UMAP Graph ##############################################################################################

#### reading in files
y_train_l1=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')


#### used for tunning UMap Wich was latter applied to the finale dimensions
# umap_fun=umap.UMAP(n_neighbors=50, n_components=2, metric='cosine')
# u_map_transform_train=umap_fun.fit_transform(x_train.iloc[:,:-8])
# raw=u_map_transform_train
# u_map_transform_train=pd.DataFrame(u_map_transform_train, columns=['UMAP_1', 'UMAP_2'])
# u_map_transform_train['rating']=y_train_l1.rating
# u_map_transform_train=u_map_transform_train.sort_values('rating')
# u_map_transform_train.rating=u_map_transform_train.rating.astype(str)+'#'
# u_map_transform_train=u_map_transform_train[u_map_transform_train.rating!='nan#']
# ones=u_map_transform_train[u_map_transform_train.rating=='1#']
# fives=u_map_transform_train[u_map_transform_train.rating=='5#']


########################################### plotting UMAP ##############################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones and Fives Densities
# sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )
# sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### Total Densities
sns.kdeplot(data=u_map_transform_train.iloc[:,0],data2=u_map_transform_train.iloc[:,1], cmap="twilight", ax=axe, shade=True, alpha=.4 )

### individual Points
sns.scatterplot(u_map_transform_train.UMAP_1, u_map_transform_train.UMAP_2, hue=u_map_transform_train.rating ,
                style=u_map_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('UMAP Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('UMAP Component One', size=30)
axe.set_xlabel('UMAP Component Two', size=30)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})



#plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\UMAP.png')
In [292]:
####################################################################### LLE Graph ##############################################################################################


# lle_fun=LLE(n_neighbors=40,n_components=2,n_jobs=-1)
# lle_transform_train=lle_fun.fit_transform(x_train.iloc[:3000,:-8]) # fitting to the training set

lle_transform_train=pd.DataFrame(lle_transform_train, columns=['LLE_1', 'LLE_2'])
lle_transform_train['rating']=y_train_l1.rating
lle_transform_train=lle_transform_train.sort_values('rating')
lle_transform_train.rating=lle_transform_train.rating.astype(str)+'#'
lle_transform_train=lle_transform_train[lle_transform_train.rating!='nan#']
ones=lle_transform_train[lle_transform_train.rating=='1#']
fives=lle_transform_train[lle_transform_train.rating=='5#']



##################################################### Plotting LLE ############################################################################
fig, axe = plt.subplots(figsize=(20,20))

### ones Desnsity ###
#sns.kdeplot(data=ones.iloc[:,0],data2=ones.iloc[:,1],cmap="Blues", ax=axe, shade=True, alpha=.4 )

### Twos Desnity #####
#sns.kdeplot(data=fives.iloc[:,0],data2=fives.iloc[:,1],cmap="Reds", shade=True, ax=axe, alpha=.4)

### total Density ####
sns.kdeplot(data=lle_transform_train.iloc[:,1],data2=lle_transform_train.iloc[:,0],cmap="twilight", ax=axe, shade=True, alpha=.4 )

sns.scatterplot(lle_transform_train.LLE_2, lle_transform_train.LLE_1, hue=lle_transform_train.rating ,
                style=lle_transform_train.rating, palette='twilight', alpha=.8)
axe.set_title('LLE Graph', size=50, fontdict= {'family':'MS Reference Sans Serif'} )
axe.set_ylabel('LLE Component One', size=30)
axe.set_xlabel('LLE Component Two', size=30)

axe.set_ylim(-.02,.03)
axe.legend(loc=2, markerscale=3,fancybox=True, shadow=True, prop={'size': 20})

# axe.set_xlim(-.05,.08)


plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\LLE.png')
plt.show()

Building the Models

  • First Param Grid
  • First Initial Large Grid Search
  • Analyzing / Graphing Top Model
  • Fitting XgBoost on new features and more parameters
  • Plotting Feature Importance
  • Single Fit Algorithum Cell
  • Y test Fit Cell
  • Data Pipe Line Test
In [5]:
######################################################################### first Paramgrid ####################################################################################

knn_params={
    'p':[2],
    'n_neighbors':[5,10,20],
}

random_params={
    'n_estimators':[100,500],
    'min_samples_split':[5,10],
    'min_samples_leaf':[5,10],
}
gradient_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'min_samples_split':[5],
    'min_samples_leaf':[5],
    'random_state':[4],
}
xg_params={
    'learning_rate':[.1,.01,.001],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'max_depth':[5,7,9],
}

master_params=[knn_params, random_params, gradient_params, xg_params]
models=[KNeighborsClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier]
In [6]:
######################################################################### first inital GridSearch ##################################################################################


path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train")
df_names=["freq", "l1","l2"]
master_1=pd.DataFrame()

##### selecting only 100 PCA's (issue with time) ##########################
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'TSNE_1','TSNE_2'])

for i, x_train in enumerate(x_train_files):
    #### reading files #######
    x_train=pd.read_pickle(path+"dimension_rd_x_train\\{}".format(x_train))
    y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[i])).rating.values
    x_train=x_train.loc[:,cols].values
    
    for j, model in enumerate(models):
        #### fitting models ####
        clf=GridSearchCV(model(), master_params[j], n_jobs= -1 ,cv=5, verbose=10)
        new_models=clf.fit(x_train, y_train)
        df=pd.DataFrame(new_models.cv_results_)
        df['df']=df_names[i]
        df['cols']='All'
        df['model']=model.__name__
        master_1=pd.concat([master_1,df], axis=0)
        print('Finished with df {} model {}'.format(df_names[i],model.__name__))
        print("Total time:", datetime.datetime.now()-now)
        print("\n\n#################################################### Fin #############################################################")
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    6.2s remaining:   41.1s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    6.8s remaining:   18.8s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:    7.4s remaining:   11.2s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    7.8s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:   12.5s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   12.8s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   13.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df freq model KNeighborsClassifier
Total time: 0:00:13.897160


#################################################### Fin #############################################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  1.3min remaining:   26.0s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:  1.5min remaining:   12.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df freq model RandomForestClassifier
Total time: 0:02:36.127865


#################################################### Fin #############################################################
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed: 41.3min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 43.6min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df freq model GradientBoostingClassifier
Total time: 0:50:03.433879


#################################################### Fin #############################################################
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 44.6min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 50.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 61.8min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 66.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 80.7min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 90.0min
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed: 97.8min
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 107.2min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 125.1min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df freq model XGBClassifier
Total time: 2:59:11.766535


#################################################### Fin #############################################################
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    1.0s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    2.6s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:    2.9s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    3.4s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    3.7s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:    4.0s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.3s finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l1 model KNeighborsClassifier
Total time: 2:59:16.441885


#################################################### Fin #############################################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  1.3min remaining:   25.1s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:  1.4min remaining:   12.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l1 model RandomForestClassifier
Total time: 3:01:38.251287


#################################################### Fin #############################################################
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 30.2min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed: 41.2min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 43.5min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l1 model GradientBoostingClassifier
Total time: 3:48:58.921180


#################################################### Fin #############################################################
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 38.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 49.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 65.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 80.1min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 88.9min
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed: 96.8min
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 106.3min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 124.0min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

Finished with df l1 model XGBClassifier
Total time: 5:58:01.128556


#################################################### Fin #############################################################
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    2.4s remaining:   16.3s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    2.8s remaining:    8.0s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:    3.5s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    3.8s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    4.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:    4.4s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.6s finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l2 model KNeighborsClassifier
Total time: 5:58:06.091925


#################################################### Fin #############################################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:  1.3min remaining:   25.6s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:  1.4min remaining:   12.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l2 model RandomForestClassifier
Total time: 6:00:27.747577


#################################################### Fin #############################################################
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed: 40.7min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 42.7min finished
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Finished with df l2 model GradientBoostingClassifier
Total time: 6:46:14.639260


#################################################### Fin #############################################################
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 38.2min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 49.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 60.6min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 65.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 79.5min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 88.3min
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 105.4min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 123.3min finished
Finished with df l2 model XGBClassifier
Total time: 8:54:36.670238


#################################################### Fin #############################################################
C:\Users\jwr17\Anaconda3\envs\PyhtonAndR\lib\site-packages\ipykernel_launcher.py:28: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

In [320]:
######################################################################### top Model Graphs Round One #########################################################################################

rd_one=pd.read_csv('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\master_all_cols.csv')
display(rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].max())
df_3=rd_one.groupby(['model','df'], as_index=False)['mean_test_score'].agg(['min','max','mean','std'],as_index=False)

df_3=df_3.reset_index()
sns.set_style('darkgrid')
df_3=df_3.sort_values('max', ascending=False)
fig, ax= plt.subplots(figsize=(11,5))
sns.barplot(y=df_3['model'], x=df_3['max'], hue=df_3.df, palette='mako')
ax.set_title('Top Model by Algorithum and DTM')
ax.set_ylabel('')
ax.set_xlabel('Accuracey')
#ax.annotate('afg',(34,45))
axe.legend(loc=2, markerscale=5,fancybox=True, shadow=True, prop={'size': 30})
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\top_models.png')
#ax.table(df_3)
model df mean_test_score
0 GradientBoostingClassifier freq 0.396526
1 GradientBoostingClassifier l1 0.405474
2 GradientBoostingClassifier l2 0.399895
3 KNeighborsClassifier freq 0.307579
4 KNeighborsClassifier l1 0.317263
5 KNeighborsClassifier l2 0.317368
6 RandomForestClassifier freq 0.389474
7 RandomForestClassifier l1 0.398421
8 RandomForestClassifier l2 0.393474
9 XGBClassifier freq 0.404000
10 XGBClassifier l1 0.413579
11 XGBClassifier l2 0.408316
In [132]:
###################################################################### NEW Feature Fit + Various Paremeters ########################################################################
### this cell dynamically changed as new features and parameters were test on feature importance


xg_params={
    'learning_rate':[.1,.001,.01],
    'n_estimators':[100,500],
    'subsample':[.6,.8,1],
    'max_depth':[5,9,15],
    
}

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_3=pd.DataFrame()

####### feature importance selection ################3
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:200]
# cols=["PCA_{}".format(x) for x in range(100)]
# cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
#              'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']


##################### Reading in File ############################################
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols].values
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
    

#### fitting models ####
clf=GridSearchCV(XGBClassifier(tree_method='gpu_hist'), xg_params ,cv=3, verbose=10)
new_models=clf.fit(x_train, y_train)
df=pd.DataFrame(new_models.cv_results_)
df['df']=df_names
df['cols']='to 50 ft'
df['model']=XGBClassifier.__name__
master_3=pd.concat([master_2,df], axis=0)
print('Finished with df {} model {}'.format(df_names[i],model.__name__))
print("Total time:", datetime.datetime.now()-now)
print("\n\n#################################################### Fin #############################################################")
Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.408, total=   6.6s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.409, total=   6.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6 .
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.1s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.6, score=0.399, total=   6.7s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8 .
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.9s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8, score=0.405, total=   6.4s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8 .
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.3s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8, score=0.405, total=   6.6s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8 .
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.0s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8, score=0.414, total=   6.4s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1 ...
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   39.4s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1, score=0.404, total=   6.3s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1 ...
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   45.7s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1, score=0.402, total=   6.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1 ...
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   52.3s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1, score=0.419, total=   6.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6 .
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   58.8s remaining:    0.0s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6, score=0.410, total=  29.2s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6, score=0.418, total=  29.4s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.6, score=0.421, total=  29.2s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8, score=0.418, total=  29.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8, score=0.417, total=  29.6s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8, score=0.416, total=  29.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1, score=0.411, total=  29.1s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1, score=0.409, total=  29.3s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=5, n_estimators=500, subsample=1, score=0.418, total=  29.3s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6, score=0.395, total=  27.5s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6, score=0.407, total=  27.7s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.6, score=0.410, total=  26.2s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8, score=0.393, total=  28.8s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8, score=0.400, total=  28.7s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=0.8, score=0.395, total=  28.6s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1, score=0.399, total=  30.2s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1, score=0.397, total=  31.1s
[CV] learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=100, subsample=1, score=0.402, total=  31.0s
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6, score=0.422, total= 1.1min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6, score=0.413, total= 1.1min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.6, score=0.417, total= 1.1min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8, score=0.413, total= 1.2min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8, score=0.414, total= 1.2min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8 .
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=0.8, score=0.411, total= 1.1min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1, score=0.404, total= 1.2min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1, score=0.411, total= 1.2min
[CV] learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1 ...
[CV]  learning_rate=0.1, max_depth=9, n_estimators=500, subsample=1, score=0.409, total= 1.2min
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6, score=0.403, total=  44.9s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6, score=0.408, total=  44.7s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.6, score=0.400, total=  45.0s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8, score=0.401, total=  49.6s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8, score=0.390, total=  49.5s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=0.8, score=0.402, total=  49.3s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1, score=0.392, total=  54.3s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1, score=0.376, total=  53.9s
[CV] learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=100, subsample=1, score=0.397, total=  53.5s
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6, score=0.419, total= 1.5min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6, score=0.419, total= 1.4min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.6, score=0.416, total= 1.4min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8, score=0.422, total= 1.4min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8, score=0.407, total= 1.4min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=0.8, score=0.419, total= 1.4min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1, score=0.411, total= 1.5min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1, score=0.397, total= 1.6min
[CV] learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.1, max_depth=15, n_estimators=500, subsample=1, score=0.410, total= 1.5min
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6, score=0.375, total=   6.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6, score=0.373, total=   6.8s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.6, score=0.377, total=   6.7s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8, score=0.377, total=   6.7s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8, score=0.366, total=   6.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=0.8, score=0.370, total=   6.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1, score=0.340, total=   6.5s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1, score=0.338, total=   6.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1, score=0.360, total=   6.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6, score=0.390, total=  30.5s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6, score=0.386, total=  30.6s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.6, score=0.376, total=  30.9s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8, score=0.384, total=  30.8s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8, score=0.373, total=  30.9s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=0.8, score=0.375, total=  31.3s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1, score=0.360, total=  30.8s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1, score=0.356, total=  31.0s
[CV] learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=5, n_estimators=500, subsample=1, score=0.371, total=  30.8s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6, score=0.384, total=  30.8s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6, score=0.388, total=  31.3s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.6, score=0.383, total=  31.3s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8, score=0.391, total=  34.8s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8, score=0.375, total=  38.3s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=0.8, score=0.377, total=  35.2s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1, score=0.339, total=  37.3s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1, score=0.323, total=  39.5s
[CV] learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=100, subsample=1, score=0.318, total=  37.6s
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6, score=0.397, total= 2.6min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6, score=0.390, total= 2.6min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.6, score=0.386, total= 2.6min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8, score=0.394, total= 2.9min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8, score=0.384, total= 2.9min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=0.8, score=0.390, total= 2.9min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1, score=0.347, total= 3.4min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1, score=0.353, total= 3.3min
[CV] learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.001, max_depth=9, n_estimators=500, subsample=1, score=0.354, total= 3.2min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6, score=0.382, total= 1.0min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6, score=0.382, total= 1.1min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.6, score=0.389, total= 1.0min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8, score=0.387, total= 1.3min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8, score=0.383, total= 1.3min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=0.8, score=0.379, total= 1.3min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1, score=0.322, total= 1.6min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1, score=0.313, total= 1.7min
[CV] learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=100, subsample=1, score=0.301, total= 1.7min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6, score=0.397, total= 5.4min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6, score=0.391, total= 5.4min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.6, score=0.397, total= 5.4min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8, score=0.398, total= 6.9min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8, score=0.385, total= 6.8min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=0.8, score=0.395, total= 6.9min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1, score=0.331, total= 8.2min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1, score=0.338, total= 8.3min
[CV] learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1 
[CV]  learning_rate=0.001, max_depth=15, n_estimators=500, subsample=1, score=0.341, total= 8.4min
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6, score=0.392, total=   6.5s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6, score=0.386, total=   6.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.6, score=0.390, total=   6.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8, score=0.390, total=   6.5s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8, score=0.371, total=   6.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8, score=0.384, total=   6.5s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1, score=0.370, total=   6.5s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1, score=0.355, total=   6.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1, score=0.377, total=   6.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6, score=0.415, total=  30.0s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6, score=0.411, total=  30.0s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6, score=0.409, total=  29.7s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8, score=0.414, total=  30.3s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8, score=0.406, total=  30.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8, score=0.407, total=  31.1s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1, score=0.399, total=  30.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1, score=0.389, total=  31.7s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, subsample=1, score=0.398, total=  30.2s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6, score=0.380, total=  30.9s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6, score=0.393, total=  31.5s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.6, score=0.388, total=  31.6s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8, score=0.398, total=  35.2s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8, score=0.387, total=  35.2s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.8, score=0.384, total=  36.6s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1, score=0.363, total=  43.4s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1, score=0.361, total=  42.6s
[CV] learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=100, subsample=1, score=0.370, total=  39.5s
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6, score=0.406, total= 2.4min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6, score=0.417, total= 2.4min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.6, score=0.410, total= 2.4min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8, score=0.411, total= 2.7min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8, score=0.410, total= 2.6min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=0.8, score=0.412, total= 2.7min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1, score=0.391, total= 3.0min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1, score=0.383, total= 2.9min
[CV] learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1 ..
[CV]  learning_rate=0.01, max_depth=9, n_estimators=500, subsample=1, score=0.391, total= 2.9min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6, score=0.383, total= 1.1min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6, score=0.401, total= 1.1min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.6, score=0.392, total= 1.1min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8, score=0.391, total= 1.4min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8, score=0.387, total= 1.4min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=0.8, score=0.383, total= 1.4min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1, score=0.347, total= 1.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1, score=0.344, total= 1.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=100, subsample=1, score=0.351, total= 1.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6, score=0.407, total= 4.8min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6, score=0.411, total= 4.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.6, score=0.413, total= 4.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8, score=0.405, total= 5.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8, score=0.404, total= 5.7min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=0.8, score=0.406, total= 5.8min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1, score=0.388, total= 6.3min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1, score=0.384, total= 6.4min
[CV] learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1 .
[CV]  learning_rate=0.01, max_depth=15, n_estimators=500, subsample=1, score=0.390, total= 6.4min
[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed: 250.0min finished
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-132-b94fc9e9dd51> in <module>
     37 df['cols']='to 50 ft'
     38 df['model']=XGBClassifier.__name__
---> 39 master_3=pd.concat([master_2,df], axis=0)
     40 print('Finished with df {} model {}'.format(df_names[i],model.__name__))
     41 print("Total time:", datetime.datetime.now()-now)

NameError: name 'master_2' is not defined
In [126]:
####################################################################### Creating Feature Importance DataFrame ################################################################################
###
#clf.estimator.fit(x_train, y_train)
# clf_1=XGBClassifier(**clf.best_params_)
# clf_1.fit(x_train, y_train)
# clf_1.feature_importances_
# plot_importance(clf_1,max_num_features=30 )
# x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
# x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]]
# ft_import_df=pd.DataFrame()
# ft_import_df['ft_im']=clf_1.feature_importances_
# ft_import_df['cols']=x_train[[col for col in x_train if col not in ['TSNE_1', 'TSNE_2']]].columns
# ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]
#ft_import_df.to_pickle('ft_imp_df.pkl')
# fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
# fin_100.mean_test_score.max()
Out[126]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, tree_method='gpu_hist', verbosity=1)
In [332]:
####################################################################### Plotting Feature Importance  ####################################################################################
sns.set_style("dark")
ft_import_df_1=ft_import_df.sort_values('ft_im', ascending=False)
fig, ax= plt.subplots(figsize=(20,10))
sns.barplot(ft_import_df_1.ft_im[0:16],ft_import_df_1.cols[0:16],ax=ax, palette='twilight')
ax.set_title("XGBoost's Top 16 Most Important Features\n", size=30,fontdict= {'family':'MS Reference Sans Serif','weight':'bold'}, weight='bold'  )
ax.set_ylabel('')
ax.set_xlabel('F-Score', weight='bold', size=15)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\figs\\XG_ft_importance_fig.png')
In [8]:
fin_100=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\results\\results_ft_selct_100.pkl')
fin_100.mean_test_score.max()
Out[8]:
0.4203157894736842
In [260]:
########################################################################## Single Fit Algorithum Cell #####################################################################################
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train_files=os.listdir(path+"y_train")
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1]))

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"
params={'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'subsample': 0.6,
 'tree_method': 'gpu_hist'}


col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']



i=0
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')

##### Selecting top 100 ft ##########################
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
x_train=x_train[cols]
y_train=pd.read_pickle(path+"y_train\\dtm_l1_y_train.pkl").rating.values
    

#### fitting models ####
for col in col_list_fix:
    x_test[col]=x_test[col].astype(float)
    x_train[col]=x_train[col].astype(float)


from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
clf=XGBClassifier(**params)
clf.fit(x_train,y_train)
scores = cross_validate(clf, x_train , y_train, cv=5)
scores
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-260-6a65c6ee9079> in <module>
     25 y_train_files=os.listdir(path+"y_train")
     26 x_train_files=os.listdir(path+"dimension_rd_x_train_2")
---> 27 ft_import_df=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl')
     28 
     29 ##### Selecting top 100 ft ##########################

~\Anaconda3\envs\PyhtonAndR\lib\site-packages\pandas\io\pickle.py in read_pickle(path, compression)
    143     """
    144     path = _stringify_path(path)
--> 145     f, fh = _get_handle(path, "rb", compression=compression, is_text=False)
    146 
    147     # 1) try standard libary Pickle

~\Anaconda3\envs\PyhtonAndR\lib\site-packages\pandas\io\common.py in _get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text)
    403         else:
    404             # Binary mode
--> 405             f = open(path_or_buf, mode)
    406         handles.append(f)
    407 

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\ft_imp_df.pkl'
In [ ]:
############################################################### Test fit For Y Variable ###################################################################

x_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test\\dtm_l1_dim_rd_test.pkl")
y_test=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl")
y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl")
x_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_train\\dtm_l1_dim_rd_train.pkl")
col_list_fix=['vader_compound', 'Text_blob_polarity', 'vader_pos', 'vader_neg', 'text_blob_subjectivity', 'vader_neu']

for col in col_list_fix:
    x_test[col]=x_test[col].astype(float)
    x_train[col]=x_train[col].astype(float)

y_train=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\\y_train\\dtm_l1_y_train.pkl")

# display(x_test.head())
# x_train.head()
# clf.fit(x_train[cols].values,y_train)
cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

pred=clf.predict(x_test[cols])
pred_1=clf.predict(x_train[cols])
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print("\n\n\n test df results")
confusion_matrix(y_test,pred_1)
print(classification_report(y_train, pred_1))



print("\n\n\n test df results")
confusion_matrix(y_test,pred)
print(classification_report(y_test, pred))
In [166]:
####################################################### saving model ####################################################
import pickle

pickle_out = open("clf.pickle","wb")
pickle.dump(clf, pickle_out)
pickle_out.close()
In [ ]:
##################################################################### Creating a Data Pipline to Test #################################################################################

%cd "C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\Amazon_reviews"
from sklearn.feature_extraction.text import TfidfTransformer

n = 3000000 # df size
s = 800 #desired sample size
skip = sorted(random.sample(range(1, n), n-s)) # rows to skip
reviews_test = pd.read_csv("Amazon_reviews.csv", skiprows=skip, names=["rating", "title", "review"])

now=datetime.datetime.now()
analyser = SentimentIntensityAnalyzer()
sent_vader = pd.DataFrame(columns=["vader_neg", "vader_neu",'vader_pos','vader_compound'])
x=0

###### vader
for i, review in enumerate(reviews_test.review):
    sent=analyser.polarity_scores(review)
    sent_vader.loc[i,"vader_neg"]=sent['neg']
    sent_vader.loc[i,"vader_neu"]=sent['neu']
    sent_vader.loc[i,"vader_pos"]=sent['pos']
    sent_vader.loc[i,"vader_compound"]=sent['compound']
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        x+=10

### text blob
sent_blob=pd.DataFrame(columns=['Text_blob_polarity', "text_blob_subjectivity"])
x=0
for i, review in enumerate(reviews_test.review):
    sent=TextBlob(review).sentiment
    sent_blob.loc[i,"Text_blob_polarity"]=sent[0]
    sent_blob.loc[i,"text_blob_subjectivity"]=sent[1]
    if i%1000==0: 
        print("{}% done at {}".format(x, datetime.datetime.now()))
        n+=1000
    
master=pd.concat([sent_vader, sent_blob], axis=1)

stem_function= PorterStemmer
port=PorterStemmer

## stemer function
def stemer(tokens, stem_function):
    stems=[]
    for token in tokens:
        stem=port().stem(token)
        stems.append(stem)
    return stems

## tokenizes words
def tokenizer(text):
    text=re.sub("(!|\\)|\\(|`|~|,|\\.|#|$|%|\\*|\\'|\\\")", "",text) # regex to get ride of function
    toks=wordpunct_tokenize(text.lower()) # converting everything to lower case
    stemed=stemer(toks, stem_function)
    return stemed
    
    
    
word_list=reviews_test.review.to_list()   
vect = CountVectorizer(tokenizer=tokenizer, stop_words = en_stop, ngram_range= (1,2),max_features = 1000)
vect.fit(word_list)
dtm = vect.transform(word_list)   
df2 = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

tfdif2 = TfidfTransformer(norm='l1', use_idf=True)
dtm_l2 = tfdif2.fit_transform(df2)
df2=pd.concat([df2, master], axis=1)
df2

sent_test=df2[['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']]
drops=['vader_neg','vader_neu', 'vader_pos', 'vader_compound','Text_blob_polarity','text_blob_subjectivity']
dimension_tec=[umap_fun, lle_fun, sparce_fun,pca_fun]
cols=[col for col in df2.columns if col not in drops]
dtm=df2[cols]

umap_ft=umap_fun.transform(dtm)
col_name=["UMAP_{}".format(x) for x in range(umap_ft.shape[1])] # creating col names
umap_df_train=pd.DataFrame(umap_ft, columns=col_name) # Sparse train df

lle_ft=lle_fun.transform(dtm)
col_name=["LLE_{}".format(x) for x in range(lle_ft.shape[1])] # creating col names
lle_df_train=pd.DataFrame(lle_ft, columns=col_name) # Sparse train df

sparce_ft=sparce_fun.transform(dtm)
col_name=["Sparse_{}".format(x) for x in range(sparce_ft.shape[1])] # creating col names
sparse_df_train=pd.DataFrame(sparce_ft, columns=col_name) # Sparse train df

pca_ft=pca_fun.transform(dtm)
col_name=["PCA_{}".format(x) for x in range(pca_ft.shape[1])] # creating col names
PCA_df_train=pd.DataFrame(pca_ft, columns=col_name) # Sparse train df

train_df=pd.concat([sparse_df_train, PCA_df_train,umap_df_train,master,lle_df_train], axis=1)

chng=['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
for col in chng:
    train_df[col]=train_df[col].astype(float)
    


cols=ft_import_df.sort_values('ft_im', ascending=False).cols[0:100]

pred=clf.predict(train_df[cols])
# pred=cross_val_predict(clf, X, y)

print(classification_report(reviews_test.rating, pred))
In [40]:
i=0
now=datetime.datetime.now()
y_train_files=os.listdir(path+"y_train")
x_train_files=os.listdir(path+"dimension_rd_x_train_2")
df_names="l1"
master_2=pd.DataFrame()

##### selecting only 100 PCA's (issue with time) ##########################
# cols=["PCA_{}".format(x) for x in range(100)]
#cols=['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2','vader_neg','vader_neu', 'vader_pos', 'vader_compound',
             #'Text_blob_polarity','text_blob_subjectivity','LLE_0','LLE_1','LLE_2', 'PCA_1','PCA_2','PCA_3']


x_train=pd.read_pickle(path+"dimension_rd_x_train_2\\{}".format(x_train_files[0]))
y_train=pd.read_pickle(path+"y_train\\{}".format(y_train_files[1])).rating.values
#x_train=x_train.loc[:,cols]

x_train[[col for col in x_train.columns if col not in ['TSNE_1', 'TSNE_2']]]
Out[40]:
Sparse_0 Sparse_1 Sparse_2 PCA_0 PCA_1 PCA_2 PCA_3 PCA_4 PCA_5 PCA_6 ... UMAP_0 UMAP_1 vader_neg vader_neu vader_pos vader_compound Text_blob_polarity text_blob_subjectivity LLE_0 LLE_1
0 -0.003447 -0.000146 -0.001338 -0.065991 0.028139 0.072930 -0.085083 -0.060750 0.044697 0.038091 ... -4.084264 6.528867 0.094 0.906 0 -0.6486 -0.0642857 0.52619 0.007993 -0.005878
1 0.000000 0.000000 0.000000 -0.071607 -0.082483 -0.001451 0.001518 -0.027734 0.044720 0.046058 ... -1.139816 2.391321 0 0.826 0.174 0.4939 -0.00416667 0.178571 -0.002267 -0.003622
2 -0.004382 0.000167 -0.015122 -0.053353 0.035840 0.105020 -0.068836 -0.044808 0.004433 -0.011878 ... -4.211996 5.050555 0.113 0.704 0.182 0.6674 -0.019725 0.657407 0.002207 -0.002527
3 -0.008564 0.000079 0.000000 -0.087841 -0.113705 -0.019849 -0.004054 -0.066870 -0.046401 -0.023208 ... -1.442914 1.960394 0.082 0.779 0.139 0.7287 0.206667 0.555 0.002733 0.005821
4 -0.000158 0.000006 -0.022199 -0.048134 0.052211 0.162864 -0.100061 -0.066620 0.021119 0.039493 ... -2.902161 5.783535 0.018 0.83 0.152 0.8338 0.151786 0.642857 0.011837 -0.003232
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9495 -0.011959 -0.026144 0.000000 0.310136 0.018771 -0.015299 -0.018336 -0.029080 -0.018617 0.035390 ... -5.325161 4.488866 0 0.764 0.236 0.9779 0.323214 0.514286 0.001513 0.001566
9496 -0.007915 -0.013018 0.000000 0.128538 -0.011163 0.011558 -0.024980 -0.019052 0.040633 0.002340 ... -5.991697 3.430374 0.118 0.782 0.099 -0.4734 0.300278 0.63381 0.002872 0.002746
9497 -0.011722 0.000165 0.000000 -0.094820 0.130341 -0.144374 -0.007765 0.021522 -0.123656 0.041907 ... -3.833734 -0.690987 0.239 0.691 0.069 -0.946 -0.328571 0.614286 0.001560 0.002668
9498 -0.003490 -0.020510 0.000000 0.221674 -0.010095 -0.064216 -0.096943 -0.049029 0.029237 0.077646 ... -5.227555 3.110168 0.041 0.83 0.129 0.5859 0.266667 0.55 -0.000388 0.000630
9499 -0.003749 -0.009046 0.000000 0.084163 -0.015090 0.007058 -0.025886 -0.013741 0.067081 0.047346 ... -5.806878 5.106863 0.051 0.648 0.301 0.9185 -0.0745833 0.473333 -0.000098 0.000991

9500 rows × 572 columns

In [579]:
### feature engineering: most Frequent Words ####
# x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_train\\dtm_l1_x_train.pkl')
# y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# x_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\x_test\\dtm_l1_x_test.pkl')
# y_test=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_test\\dtm_l1_y_test.pkl')
# ###### Most Frequent Words #########

# top=500
# df=x_train
# df['rating']=y_train.rating
# # l1_dtm['rating']=rev.rating
# # display(l1_dtm.melt(var_name='rating'))
# top_1=df.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
# top_2=df.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
# top_3=df.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
# top_4=df.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
# top_5=df.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

# one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3]
# two_toks=[col for col in top_2 if col not in top_5 | top_4 ]
# three_toks=[col for col in top_3 if col not in top_5 | top_1]
# four_toks=[col for col in top_4 if col not in top_2|top_1]
# five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]


# x=['familiar' for x in range(len(one_toks))]
# g=0
# freq_map={**{i:'Low star' for i in one_toks},**{i:'Med star' for i in three_toks},**{i:'High star' for i in five_toks}}


for dex in df.T.iloc[:-9,0].index:
    df.T.loc[dex,]
Out[579]:
Index(['$', '&', '--', '1', '1 2', '10', '100', '12', '15', '2',
       ...
       'wrong', 'wrote', 'ye', 'year', 'year ago', 'year old', 'yet', 'youll',
       'young', 'youv'],
      dtype='object', length=1000)
In [556]:
###### feature Engineering: Binning polarity
x_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\dimension_rd_x_test_2\\dtm_l1_dim_rd_test_2.pkl')
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')

pd.cut(x_train.vader_compound
Out[556]:
0      0.7906
1      0.4926
2      0.1265
3      0.1263
4      0.9531
        ...  
495    0.8233
496    0.9351
497    0.4215
498   -0.6537
499    0.7003
Name: vader_compound, Length: 500, dtype: object

Clustering Process

  • HDBSCAN
  • DBSCAN
  • K-Means
  • Most Frequenct Words
  • Token Cluster Attempt
In [25]:
#################################################################################### HDBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"

path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

### reducing cols ####
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])

### getting file directoryies 
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')
sent_dir=os.listdir(path+'sentiment')


now=datetime.datetime.now()
hdbscan_fun=HDBSCAN(min_cluster_size=500, min_samples=50, core_dist_n_jobs=-1,metric='manhattan')
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
# cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','Sparse_2','UMAP_0','UMAP_1','UMAP_2'])


cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
#hdbscan_fun.fit(df[cols].values)
hdbscan_fun.fit(df[sent])

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# cluster_df.to_pickle(names[i]+'_cluster.pkl')
# print("############### DF {} at {}##########".format(file,now))
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters
In [22]:
path="C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\"

x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
df.iloc[:,-13:]
sent=['vader_compound','Text_blob_polarity']
In [26]:
cluster_df['rating']=y_train.rating
pd.crosstab(cluster_df.HDBSCAN.astype(str),cluster_df.rating.astype(str))
Out[26]:
rating 1.0 2.0 3.0 4.0 5.0 nan
HDBSCAN
-1 493 499 482 481 455 148
0 187 167 164 167 170 51
1 1153 1174 1151 1118 1152 288
In [110]:
######################################################################### HDBSCAN Crosstab #######################################################################################

cluster_df['rating']=y_train.rating.astype(str)
cluster_df.HDBSCAN=cluster_df.HDBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['HDBSCAN'],margins=True, margins_name='total')
Out[110]:
HDBSCAN -1 0 1 2 3 4 5 6 7 total
rating
1 1343 29 130 73 2 1 183 60 12 1833
2 1408 31 115 57 4 1 154 61 9 1840
3 1350 39 109 54 3 3 177 53 9 1797
4 1325 34 114 57 1 4 169 55 7 1766
5 1359 29 115 52 1 1 157 53 10 1777
total 6785 162 583 293 11 10 840 282 47 9013
In [31]:
#################################################################################### DBSCAN ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
db_fun= DBSCAN(eps=.5,min_samples=500,p=2, n_jobs=-1,leaf_size=100)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
cols=["PCA_{}".format(x) for x in range(100)]
cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
# cols=['TSNE_1','TSNE_2']
# cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
db_fun.fit(df[sent].values)
#db_fun.fit(df.values)

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['DBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters
In [32]:
################################################################ DBSCAN Cross Tab ########################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.DBSCAN=cluster_df.DBSCAN.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['DBSCAN'],margins=True, margins_name='total')
Out[32]:
DBSCAN -1 0 1 total
rating
1 493 187 1153 1833
2 499 167 1174 1840
3 482 164 1151 1797
4 481 167 1118 1766
5 455 170 1152 1777
total 2410 855 5748 9013
In [29]:
#################################################################################### K-MEANS ################################################################################################
%cd "C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\clusters"
now=datetime.datetime.now()
km_fun= KMeans(n_clusters=5, n_jobs=-1)
names=['freq','l1','l2']
x_train_dir=os.listdir(path+'dimension_rd_x_train_2')
y_train_dir=os.listdir(path+'y_train')[1]
y_train=pd.read_pickle('C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\y_train\\dtm_l1_y_train.pkl')
#cols=["PCA_{}".format(x) for x in range(100)]
#cols.extend(['Sparse_0','Sparse_1','UMAP_0','UMAP_1'])
#cols.extend(['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity'])
cols=['TSNE_1','TSNE_2']
#cols=['vader_neg', 'vader_neu','vader_pos', 'vader_compound', 'Text_blob_polarity','text_blob_subjectivity']

df=pd.read_pickle(path+'dimension_rd_x_train_2\\'+x_train_dir[0])
km_fun.fit(df[sent].values)
#km_fun.fit(df.values)

cluster=km_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['K_MEANS'])  
# print("############### DF {} at {}##########".format(file,now))
C:\Users\jwr17\OneDrive - University of New Hampshire\Text Minning\New folder\clusters
In [30]:
######################################################################### K-Means Cross Tab ##################################################################################
cluster_df['rating']=y_train.rating.astype(str)
cluster_df.K_MEANS=cluster_df.K_MEANS.astype(str)
pd.crosstab(cluster_df['rating'],cluster_df['K_MEANS'],margins=True, margins_name='total')
Out[30]:
K_MEANS 0 1 2 3 4 total
rating
1 296 286 591 367 293 1833
2 329 289 606 368 248 1840
3 341 281 567 364 244 1797
4 313 272 572 351 258 1766
5 297 271 619 343 247 1777
total 1576 1399 2955 1793 1290 9013
In [333]:
################# Clustering on tokens #################
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")
rev=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\reviews.pkl")
In [487]:
####################################################################### Frequency of word by Ratings ##################################################

# l1_dtm=l1_dtm.iloc[:,:-6]
l1_dtm=pd.read_pickle("C:\\Users\\jwr17\\OneDrive - University of New Hampshire\\Text Minning\\New folder\\df's\\dtm_l1.pkl")

top=300
l1_dtm['rating']=rev.rating
# display(l1_dtm.melt(var_name='rating'))
top_1=l1_dtm.groupby('rating').sum().T.sort_values(1, ascending=False)[0:top].index
top_2=l1_dtm.groupby('rating').sum().T.sort_values(2, ascending=False)[0:top].index
top_3=l1_dtm.groupby('rating').sum().T.sort_values(3, ascending=False)[0:top].index
top_4=l1_dtm.groupby('rating').sum().T.sort_values(4, ascending=False)[0:top].index
top_5=l1_dtm.groupby('rating').sum().T.sort_values(5, ascending=False)[0:top].index

one_toks=[col for col in top_1 if col not in top_5 | top_4 | top_3 | top_2 ]
two_toks=[col for col in top_2 if col not in top_5 | top_4 |]
three_toks=[col for col in top_3 if col not in top_5 | top_1]
four_toks=[col for col in top_4 if col not in top_2|top_1]
five_toks=[col for col in top_5 if col not in top_3 |top_2 | top_1]
In [410]:
################################################################################# Clustering Token Atempt  ########################################################################################
trn=l1_dtm.pivot(columns='rating').T.reset_index().fillna(0)

tokens=trn.level_0
tokens=trn.rating
trn



umap_fun=umap.UMAP(n_neighbors=50, n_components=10, metric='cosine')

rd_trn=umap_fun.fit_transform(trn.iloc[:,2:])

hdbscan_fun.fit(rd_trn)

cluster=hdbscan_fun.labels_
cluster_df=pd.DataFrame(cluster, columns=['HDBSCAN'])  
# print("############### DF {} at {}##########".format(file,now))
In [427]:
token_cluster=pd.concat([cluster_df,tokens,trn.rating], axis=1)
token_cluster.groupby('rating')
token_cluster[['HDBSCAN','rating']]
pd.crosstab(token_cluster['rating'],token_cluster['HDBSCAN'])
Out[427]:
HDBSCAN 0 1 2 3 4
rating
1 0 0 1000 0 0
2 0 0 0 1000 0
3 0 1000 0 0 0
4 999 0 0 1 0
5 0 0 0 0 1000