import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Sample text
doc1 = """Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented, and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library. Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions. Python 2 was discontinued with version 2.7.18 in 2020."""

# Process the text with spaCy
docx = nlp(doc1)

# Tokenization and word-frequency calculation (ignoring stop words and punctuation)
stopwords = list(STOP_WORDS)
word_frequencies = {}
for word in docx:
    if word.text.lower() not in stopwords and word.text not in punctuation:
        if word.text.lower() not in word_frequencies:
            word_frequencies[word.text.lower()] = 1
        else:
            word_frequencies[word.text.lower()] += 1

# Normalize word frequencies by dividing by the maximum frequency
max_freq = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / max_freq

# Score each sentence by summing the normalized frequencies of its words
sentence_scores = {}
for sent in docx.sents:
    for word in sent:
        if word.text.lower() in word_frequencies:
            if sent not in sentence_scores:
                sentence_scores[sent] = word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.text.lower()]

# Select the top 30% of sentences based on their scores
select_length = int(len(sentence_scores) * 0.3)
summary_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)

# Join the selected sentences into the final summary
final_summary = [sent.text for sent in summary_sentences]
summary = ' '.join(final_summary)

print("Original Text Length:", len(doc1.split()))
print("Summary Length:", len(summary.split()))
print("\nSummary:\n", summary)