```python
# Filter only related columns and drop duplicated reviews
reviews = reviews[["Review_Text", "Rating"]]
reviews = reviews.drop_duplicates(subset='Review_Text')
```
Let's use seaborn's countplot to draw a bar plot and get a feel for the overall sentiment of the reviews.
```python
# Create a bar plot with value counts
sns.countplot(x='Rating', data=reviews)
```
```python
def text_preprocessing(text):
    # Convert words to lower case
    text = text.lower()

    # Expand contractions
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Tokenize each word
    text = nltk.WordPunctTokenizer().tokenize(text)

    # Lemmatize each word and drop single-character tokens
    text = [nltk.stem.WordNetLemmatizer().lemmatize(token, pos='v')
            for token in text if len(token) > 1]

    return text
```
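A quick sanity check makes the pipeline easier to follow. The snippet below is illustrative only: the sample sentence is made up, and it assumes the `contractions` dictionary and the `re`/`nltk` imports from earlier in the article are in scope.

```python
# Illustrative check of text_preprocessing (assumes `contractions` is defined)
sample = "We didn't enjoy the queues, but the parade was AMAZING! https://example.com"
print(text_preprocessing(sample))
# Roughly: ['we', 'do', 'not', 'enjoy', 'the', 'queue', 'but', 'the', 'parade', 'be', 'amaze']
```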
```python
def to_string(text):
    # Convert list to string
    text = ' '.join(map(str, text))
    return text

# Create a list of tokens by applying the text_preprocessing function
reviews['Review_Clean_List'] = list(map(text_preprocessing, reviews.Review_Text))

# Convert back to string with the to_string function
reviews['Review_Clean'] = list(map(to_string, reviews['Review_Clean_List']))
```
```python
# Import Counter
from collections import Counter

# Join all reviews into one word corpus
review_words = ','.join(list(reviews['Review_Clean'].values))

# Count and find the 30 most frequent words
word_counter = Counter(review_words.split())
most_frequent = word_counter.most_common(30)

# Bar plot of frequent words
fig = plt.figure(1, figsize=(20, 10))
freq_df = pd.DataFrame(most_frequent, columns=("words", "count"))
sns.barplot(x='words', y='count', data=freq_df, palette='winter')
plt.xticks(rotation=45);
```
```python
# Load the list of stopwords
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords_list = stopwords.words('english')
stopwords_list.extend(['park', 'disney', 'disneyland'])

# Remove stopwords from each tokenized review
reviews['Review_Clean_List'] = [[word for word in line if word not in stopwords_list]
                                for line in reviews['Review_Clean_List']]
reviews['Review_Clean'] = list(map(to_string, reviews['Review_Clean_List']))

# Join all reviews into one word corpus again
review_words = ','.join(list(reviews['Review_Clean'].values))

# Count and find the 30 most frequent words
word_counter = Counter(review_words.split())
most_frequent = word_counter.most_common(30)

# Bar plot of frequent words
fig = plt.figure(1, figsize=(20, 10))
freq_df = pd.DataFrame(most_frequent, columns=("words", "count"))
sns.barplot(x='words', y='count', data=freq_df, palette='winter')
plt.xticks(rotation=45);
```
Bonus section
Let's use the `review_words` we created earlier to generate a word cloud of the text corpus.
```python
# Generate the word cloud
from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white",
                      max_words=200,
                      contour_width=8,
                      contour_color="steelblue",
                      collocations=False).generate(review_words)

# Visualize the word cloud
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()
```
```python
# Create Dictionary
id2word = gensim.corpora.Dictionary(reviews['Review_Clean_List'])

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in reviews['Review_Clean_List']]
```
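To make the bag-of-words encoding concrete, the following illustrative snippet decodes the first review back into (word, count) pairs; it only assumes the `corpus` and `id2word` objects created above.

```python
# Peek at the bag-of-words representation of the first review
first_doc = corpus[0]
print(first_doc[:10])                      # (token_id, count) pairs
print([(id2word[token_id], count) for token_id, count in first_doc[:10]])
```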
```python
from gensim.models import CoherenceModel

# Compute coherence score for 1 to 9 topics
number_of_topics = []
coherence_score = []
for i in range(1, 10):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                iterations=50,
                                                num_topics=i)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=reviews['Review_Clean_List'],
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    number_of_topics.append(i)
    coherence_score.append(coherence_lda)

# Create a dataframe of coherence score by number of topics
topic_coherence = pd.DataFrame({'number_of_topics': number_of_topics,
                                'coherence_score': coherence_score})

# Print a line plot
sns.lineplot(data=topic_coherence, x='number_of_topics', y='coherence_score')
```
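Rather than eyeballing the plot, you can also read off a candidate programmatically. This is a heuristic sketch: picking the maximum coherence is one option, though in practice the "elbow" of the curve is often preferred.

```python
# Heuristic: select the topic count with the highest coherence score
best_row = topic_coherence.loc[topic_coherence['coherence_score'].idxmax()]
print("Best number of topics: {} (coherence = {:.3f})".format(
    int(best_row['number_of_topics']), best_row['coherence_score']))
```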
```python
# Define the number of topics
n_topics = 4

# Run the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=n_topics,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=10,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)
```
Let's examine the words that occur in each topic and their relative weights.
```python
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
```
```python
# Import and enable notebook to run visualization
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
```
On the left, each topic is represented as a bubble on the intertopic distance map, a multidimensional scaling of the topics onto the x and y axes; clicking a bubble adjusts the visualization to that topic. The distance between bubbles reflects the semantic distance between topics, and overlapping bubbles share many common words. In our case the topics are well separated and do not overlap. The area of each bubble indicates the topic's coverage: Topic 1 accounts for roughly 50% of the reviews, while the remaining topics split the rest almost evenly.
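These coverage figures can also be approximated directly from the model. The sketch below is a rough approximation rather than pyLDAvis's exact computation; it assumes the `corpus`, `n_topics`, and `lda_model` objects created above.

```python
# Approximate each topic's share of the corpus from the document-topic distributions
topic_weights = [0.0] * n_topics
for doc_bow in corpus:
    # get_document_topics returns (topic_id, probability) pairs for one document
    for topic_id, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0.0):
        topic_weights[topic_id] += prob

total = sum(topic_weights)
for topic_id, weight in enumerate(topic_weights):
    print("Topic {}: {:.1%} of the corpus".format(topic_id, weight / total))
```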