# Comments_summarization_spacy

 import spacy

from spacy.lang.en.stop_words import STOP_WORDS

from string import punctuation

from collections import Counter

from heapq import nlargest

import pandas as pd


# Load the large English spaCy pipeline once at import time; summarize_text
# uses it to tokenize the input and iterate over its sentences.
nlp = spacy.load('en_core_web_lg')

# Stop words are kept as a list copied from spaCy's STOP_WORDS collection.
stopwords = [*STOP_WORDS]

# Extend the punctuation characters with the newline character.
punctuation = punctuation + '\n'


# Function to summarize text

def summarize_text(text, ratio=0.3):
    """Build an extractive summary of ``text`` from its best-scoring sentences.

    Every token that is neither a stop word nor punctuation/whitespace is
    counted case-insensitively. Counts are normalized by the most frequent
    word, and each sentence is scored as the sum of the normalized weights of
    the tokens it contains.

    Args:
        text: The text to summarize.
        ratio: Fraction of the document's sentences to keep. Defaults to 0.3.
            At least one sentence is requested, so short texts still produce
            a summary.

    Returns:
        The selected sentences joined by single spaces, ordered from the
        highest to the lowest score. An empty string is returned when
        ``text`` contains no countable words.
    """
    doc = nlp(text)

    # Keys are lower-cased so that the lower-cased lookup in the scoring loop
    # below actually finds them; storing the original casing made every
    # capitalized word invisible to the scorer and split counts by case.
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        # is_punct / is_space also reject multi-character punctuation and
        # whitespace runs, which the substring test against `punctuation`
        # alone lets through.
        if not (token.is_punct or token.is_space)
        and token.text.lower() not in stopwords
        and token.text.lower() not in punctuation
    )

    # Nothing to score (empty text or only stop words/punctuation): return an
    # empty summary instead of letting max() fail on an empty sequence.
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    weights = {
        word: count / max_frequency
        for word, count in word_frequencies.items()
    }

    sentences = list(doc.sents)

    # Only sentences containing at least one weighted word are candidates.
    sentence_scores = {}
    for sent in sentences:
        score = sum(weights.get(token.text.lower(), 0) for token in sent)
        if score:
            sentence_scores[sent] = score

    # int() truncates, so fewer than four sentences at the default ratio
    # would otherwise select zero sentences.
    select_length = max(1, int(len(sentences) * ratio))

    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)

    return ' '.join(sent.text for sent in summary)


# Function to summarize grouped worknotes

def summarize_comments(comments):
    """Summarize a group of worknotes as one text.

    The entries of ``comments`` are concatenated with single spaces and the
    combined string is handed to ``summarize_text``; its result is returned
    unchanged.
    """
    return summarize_text(' '.join(comments))


# Sample data: two worknotes for each of the RTSK numbers '001' and '002'.
data = {
    'RTSK Number': ['001'] * 2 + ['002'] * 2,
    'RTSK Worknote': [
        'Worknote 1 for RTSK 001. This is a detailed note explaining various aspects of the task.',
        'Worknote 2 for RTSK 001. Additional comments and updates.',
        'Worknote 1 for RTSK 002. Initial setup and configuration details.',
        'Worknote 2 for RTSK 002. Follow-up and final remarks.',
    ],
}

df = pd.DataFrame(data)

# Collect the worknotes of each RTSK number, summarize every group with
# summarize_comments, and turn the group keys back into a regular column.
summarized_df = (
    df.groupby('RTSK Number')['RTSK Worknote']
    .apply(summarize_comments)
    .reset_index()
)

# The summarized column still carries the worknote column name; relabel it.
summarized_df.columns = ['RTSK Number', 'Summary']

# Show one summary row per RTSK number.
print(summarized_df)


# No comments

# Theme images by tjasam. Powered by Blogger.