# Comments summarization with spaCy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import pandas as pd
# Load the large English model
nlp = spacy.load('en_core_web_lg')
# Stop words are only used for membership tests (one per token), so keep them
# in a set rather than converting the STOP_WORDS set to a list.
stopwords = set(STOP_WORDS)
# Make a bare newline count as punctuation in the token filter as well
punctuation += '\n'
# Function to summarize text
def summarize_text(text, ratio=0.3):
    """Return an extractive summary built from the top-scoring sentences of *text*.

    Every token that is not a stop word, punctuation or whitespace is counted
    case-insensitively; counts are normalized by the highest count, and each
    sentence is scored as the sum of the normalized frequencies of its tokens.
    The best-scoring sentences are joined with single spaces, highest score
    first.

    Args:
        text: Raw text to summarize.
        ratio: Fraction of the sentences to keep (default 0.3). At least one
            sentence is kept whenever any sentence has a score.

    Returns:
        The summary string, or '' when *text* is empty or contains no
        scorable words.
    """
    # Nothing to summarize: skip the pipeline entirely.
    if not text or not text.strip():
        return ''

    doc = nlp(text)

    # Key the table by lowercased text. The scoring loop below looks tokens up
    # lowercased, so case-sensitive keys would silently drop capitalized words.
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if not token.is_space
        and token.text.lower() not in stopwords
        and token.text.lower() not in punctuation
    )
    # All tokens were filtered out; max() below would raise on an empty table.
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    normalized = {
        word: count / max_frequency
        for word, count in word_frequencies.items()
    }

    sentences = list(doc.sents)
    sentence_scores = {}
    for sent in sentences:
        score = sum(normalized.get(token.text.lower(), 0) for token in sent)
        # Only sentences containing at least one counted word are candidates.
        if score:
            sentence_scores[sent] = score

    # int() truncates to 0 for short texts; always keep at least one sentence.
    select_length = max(1, int(len(sentences) * ratio))
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in summary)
# Function to summarize grouped worknotes
def summarize_comments(comments):
    """Summarize a group of worknotes as a single text.

    Args:
        comments: Iterable of worknote values, e.g. the Series that a pandas
            groupby passes for one group.

    Returns:
        The result of summarize_text on the space-joined worknotes, or ''
        when the group contains no usable text.
    """
    # Skip missing entries (None/NaN/NA) and coerce the rest to str; a bare
    # ' '.join(comments) raises TypeError on any non-string value.
    usable = [str(comment) for comment in comments if not pd.isna(comment)]
    combined_comments = ' '.join(usable)
    if not combined_comments.strip():
        return ''
    return summarize_text(combined_comments)
# Demo input: two tasks, each with two worknotes
data = {
    'RTSK Number': ['001', '001', '002', '002'],
    'RTSK Worknote': [
        'Worknote 1 for RTSK 001. This is a detailed note explaining various aspects of the task.',
        'Worknote 2 for RTSK 001. Additional comments and updates.',
        'Worknote 1 for RTSK 002. Initial setup and configuration details.',
        'Worknote 2 for RTSK 002. Follow-up and final remarks.',
    ],
}
df = pd.DataFrame(data)

# Collapse each task's worknotes into one summary row; naming the value
# column while resetting the index yields ['RTSK Number', 'Summary'].
summarized_df = (
    df.groupby('RTSK Number')['RTSK Worknote']
    .apply(summarize_comments)
    .reset_index(name='Summary')
)

# Show the per-task summaries
print(summarized_df)
# No comments