"""

6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier model to perform this task. 

Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set


"""


import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB



# Load the labelled messages. The CSV is read without a header row; its two
# columns are named 'message' (the text) and 'label' (the class).
msg = pd.read_csv('naivetext.csv', names=['message', 'label'])

# DataFrame.shape is the (rows, columns) tuple of the dataset.
print('The dimensions of the dataset', msg.shape)  # original author's note: (18,2)

# Encode the textual class labels as integers: 'pos' -> 1, 'neg' -> 0.
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})

# Series.map() turns every label that is not a key of the mapping into NaN,
# and NaN is not a usable class label for the classifier. Fail here with a
# message that names the offending values instead of failing later in fit().
unmapped = msg.labelnum.isna()
if unmapped.any():
    raise ValueError(
        "naivetext.csv contains labels other than 'pos'/'neg': {}".format(
            sorted(msg.label[unmapped].astype(str).unique())
        )
    )

X = msg.message
y = msg.labelnum

# Split the dataset into train and test data. No test_size or random_state is
# given, so scikit-learn's default split proportion is used and the shuffle
# differs from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)

print ('\n the total number of Training Data :',ytrain.shape)

print ('\n the total number of Test Data :',ytest.shape)



# Build the bag-of-words document-term matrices. The vocabulary is learned
# from the training messages only (fit_transform); the test messages are then
# encoded with that same vocabulary (transform).
cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)
xtest_dtm = cv.transform(xtest)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on older releases. list() keeps the printed output a plain list
# regardless of which method supplied the names.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = list(cv.get_feature_names())

# Output the words or tokens found in the training documents.
print('\n The words or Tokens in the text documents \n')

print(feature_names)

# Show the training document-term matrix as a table with one column per token.
df = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)

print(df)

from sklearn import metrics

# Fit a multinomial Naive Bayes (NB) model on the training document-term
# matrix, then predict labels for the test documents.
clf = MultinomialNB()
clf.fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# Report accuracy, the confusion matrix, precision and recall, each computed
# from the true test labels and the predicted labels.
accuracy = metrics.accuracy_score(ytest, predicted)
print('\n Accuracy of the classifier is', accuracy)

print('\n Confusion matrix')
confusion = metrics.confusion_matrix(ytest, predicted)
print(confusion)

precision = metrics.precision_score(ytest, predicted)
print('\n The value of Precision', precision)

recall = metrics.recall_score(ytest, predicted)
print('\n The value of Recall', recall)