# Draw one pie chart per column, side by side in a single figure, showing how
# the rows of `stress` are distributed over the values of that column.
# NOTE(review): `stress` is presumably a pandas DataFrame and `plt` is
# matplotlib.pyplot, both set up earlier in the file -- confirm.
lst = ['subreddit', 'label']
plt.figure(figsize=(15, 12))
for i, column in enumerate(lst):
    # Subplot indices are 1-based, hence i + 1.
    plt.subplot(1, 2, i + 1)
    a = stress[column].value_counts()
    lbl = a.index
    plt.title(column + '_Distribution')
    plt.pie(x=a, labels=lbl, autopct="%.1f %%")
# Show once, after both subplots have been drawn into the same figure.
plt.show()
# defining function for preprocessing
def preprocess(text, remove_digits=True):
    """Normalise raw text and drop punctuation and English stop words.

    Steps, in order: replace runs of non-word characters and runs of
    whitespace with a single space, optionally strip digit runs, strip
    dangling hyphens, lower-case, remove ``string.punctuation`` characters,
    and remove English stop words.

    Args:
        text: The raw input string.
        remove_digits: When true (the default), digit runs that are not
            preceded by a word character are removed. The original code
            accepted this flag but never read it; the default keeps the
            previous behaviour.

    Returns:
        The cleaned text as a single space-separated string.
    """
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # NOTE(review): the next two patterns were truncated in the source
    # ('"(?, ""'), apparently by markup stripping that removed everything
    # from '<' onward. They are reconstructed here as negative look-behind
    # patterns -- confirm against the original notebook.
    if remove_digits:
        text = re.sub(r"(?<!\w)\d+", "", text)
    text = re.sub(r"-(?!\w)|(?<!\w)-", "", text)
    text = text.lower()
    nopunc = ''.join(ch for ch in text if ch not in string.punctuation)
    # Build the stop-word set once per call instead of re-reading the corpus
    # list for every word. The text is already lower-cased at this point.
    stop_words = set(stopwords.words('english'))
    nopunc = ' '.join(word for word in nopunc.split() if word not in stop_words)
    return nopunc


# Defining a function for lemmatization
def lemmatize(words):
    """Return the lemma of every token produced by ``nlp(words)``.

    NOTE(review): ``nlp`` is defined outside this block; the ``.lemma_``
    attribute suggests a spaCy pipeline -- confirm.

    Args:
        words: The text to run through ``nlp``.

    Returns:
        A list with one ``lemma_`` value per token, in token order.
    """
    return [token.lemma_ for token in nlp(words)]
# converting them into string
def listtostring(s):
    """Join the strings in *s* into one string separated by single spaces.

    Args:
        s: An iterable of strings.

    Returns:
        The elements of *s* joined with ``' '``; an empty string when *s*
        is empty.
    """
    return ' '.join(s)
def clean_text(input):
    """Run the full cleaning pipeline on one piece of text.

    Calls ``preprocess``, then ``lemmatize``, then ``listtostring``.

    Args:
        input: The raw text. The name shadows the ``input`` builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        Whatever ``listtostring`` returns for the lemma list.
    """
    word = preprocess(input)
    lemmas = lemmatize(word)
    return listtostring(lemmas)


# Creating a feature to store clean texts
stress['clean_text'] = stress['text'].apply(clean_text)
stress.head()
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Model Building
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    KFold,
    train_test_split,
    cross_val_score,
    cross_val_predict,
)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    StackingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
)
from sklearn.neighbors import KNeighborsClassifier

# Model Evaluation
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
)
from sklearn.pipeline import Pipeline

# Time
from time import time

# Defining target & feature for ML model building
x = stress['clean_text']
y = stress['label']
# Hold out 30% of the rows for testing; random_state=1 fixes the shuffle
# seed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)
# Self defining function to convert the data into vector form by tf idf
# vectorizer and classify and create model by Decision Tree
def model_dt_tf(x_train, x_test, y_train, y_test):
    """Vectorise the text with TF-IDF and fit a decision tree on it.

    The vectoriser is fitted on the training text only and then applied to
    the test text. The resulting scores are published through the
    module-level globals ``acc_dt_tf`` and ``f1_dt_tf``.

    NOTE(review): in the source this function stops right after
    ``t0 = time()``. The fit / predict / scoring statements below are a
    reconstruction based on the declared globals and the
    "fitting training data into the model & predicting" comment -- confirm
    against the original notebook.

    Args:
        x_train: Training documents (raw text).
        x_test: Test documents (raw text).
        y_train: Training labels.
        y_test: Test labels.
    """
    global acc_dt_tf, f1_dt_tf
    # Imported locally so this function does not depend on imports made
    # elsewhere in the file for the reconstructed scoring step.
    from sklearn.metrics import accuracy_score, f1_score

    # Text to vector transformation
    vector = TfidfVectorizer()
    x_train = vector.fit_transform(x_train)
    x_test = vector.transform(x_test)

    ovr = DecisionTreeClassifier(random_state=1)

    # fitting training data into the model & predicting
    # NOTE(review): t0 is kept from the original; whatever consumed it
    # (presumably a timing report) is not present in the source.
    t0 = time()
    ovr.fit(x_train, y_train)
    y_pred = ovr.predict(x_test)

    acc_dt_tf = accuracy_score(y_test, y_pred)
    # NOTE(review): f1_score defaults to average='binary'; this assumes a
    # two-class label -- pass `average=` explicitly if that is not the case.
    f1_dt_tf = f1_score(y_test, y_pred)
# Creating tabular format for better comparison
# The acc_*_tf / f1_*_tf values are module-level globals; they must already
# have been assigned before this runs.
tbl = pd.DataFrame()
tbl['Model'] = pd.Series([
    'Logistic Regreesion',
    'Multinomial NB',
    'Decision Tree',
    'KNN',
    'Random Forest',
    'Adaptive Boosting',
])
tbl['Accuracy'] = pd.Series([
    acc_lr_tf, acc_nb_tf, acc_dt_tf, acc_knn_tf, acc_rf_tf, acc_ab_tf,
])
tbl['F1_Score'] = pd.Series([
    f1_lr_tf, f1_nb_tf, f1_dt_tf, f1_knn_tf, f1_rf_tf, f1_ab_tf,
])
# set_index returns a new frame (inplace defaults to False); the result is
# not stored, so `tbl` keeps 'Model' as an ordinary column.
tbl.set_index('Model')

# Best model on the basis of F1 Score
# sort_values likewise returns a sorted copy and leaves `tbl` unchanged.
tbl.sort_values('F1_Score', ascending=False)
# Model building
# One default-configured estimator per algorithm; random_state=1 is set on
# the estimators that accept it here so repeated runs give the same result.
lr = LogisticRegression()
mnb = MultinomialNB()
dct = DecisionTreeClassifier(random_state=1)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=1)
ab = AdaBoostClassifier(random_state=1)

# `m` and `model_name` are parallel lists: model_name[k] labels m[k].
m = [lr, mnb, dct, knn, rf, ab]
model_name = ['Logistic R', 'MultiNB', 'DecTRee', 'KNN', 'R forest', 'Ada Boost']
# Sample input: a single-element list holding one free-text post.
# NOTE(review): presumably used as a "stressed" example for prediction; the
# code that consumes it is not visible here.
data=["""I don't have the ability to cope with it anymore. I'm trying, but a lot of things are triggering me, and I'm shutting down at work, just finding the place I feel safest, and staying there for an hour or two until I feel like I can do something again. I'm tired of watching my back, tired of traveling to places I don't feel safe, tired of reliving that moment, tired of being triggered, tired of the stress, tired of anxiety and knots in my stomach, tired of irrational thought when triggered, tired of irrational paranoia. I'm exhausted and need a break, but know it won't be enough until I journey the long road through therapy. I'm not suicidal at all, just wishing this pain and misery would end, to have my life back again."""]
# Second sample input, again a single-element list holding one post.
# NOTE(review): this rebinds `data`, so the previous sample is discarded
# unless it was consumed in between -- confirm the intended order of use.
data=["""In case this is the first time you're reading this post... We are looking for people who are willing to complete some online questionnaires about employment and well-being which we hope will help us to improve services for assisting people with mental health difficulties to obtain and retain employment. We are developing an employment questionnaire for people with personality disorders; however we are looking for people from all backgrounds to complete it. That means you do not need to have a diagnosis of personality disorder – you just need to have an interest in completing the online questionnaires. The questionnaires will only take about 10 minutes to complete online. For your participation, we’ll donate £1 on your behalf to a mental health charity (Young Minds: Child & Adolescent Mental Health, Mental Health Foundation, or Rethink)"""]