Spaces:

hughustla
/

text_summarisation_demo

Runtime error

App Files Files Community

hughustla committed on May 16, 2022

Commit

5a60200

1 Parent(s): c3c27cd

Add application files

Browse files

Files changed (5) hide show

app.py +17 -0
poetry.lock +0 -0
pyproject.toml +20 -0
src/text_rank_summarizer.py +67 -0
src/transformer_summarization.py +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,17 @@

+# Gradio demo entry point: wires a summarize() backend to a simple text-in/text-out UI.
+import gradio as gr
+from gradio import inputs
+# Alternative backend kept for reference; its summarize() is declared as
+# summarize(text, summary_lines) in src/text_rank_summarizer.py.
+# from src.text_rank_summarizer import summarize
+from src.transformer_summarization import summarize
+# Text box that receives the document to be summarised.
+long_text_input = inputs.Textbox(lines=200, label='Long Text')
+# NOTE(review): this widget is created but never passed to gr.Interface below,
+# so it is not shown and its value never reaches summarize().
+summary_lines = inputs.Number(default=4, label='Summary Lines')
+# One input (the long text) and one text output; fn is the imported summarize().
+# NOTE(review): 'css/index.css' is not among the files added in this commit - confirm it exists.
+interface = gr.Interface(fn=summarize,
+                         inputs=[long_text_input],
+                         outputs=['text'],
+                         live=False,
+                         layout='horizontal',
+                         css='css/index.css')
+if __name__ == '__main__':
+    # Only start the server when run as a script; launch()'s result is unpacked into three names.
+    app, local_url, share_url = interface.launch()

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,20 @@

+[tool.poetry]
+name = "text_summarisation_demo"
+version = "0.1.0"
+description = ""
+authors = ["swhustla <fdkelly@gmail.com>"]
+[tool.poetry.dependencies]
+python = ">=3.9,<3.11"
+# Must be a version constraint, not a package name. app.py relies on the
+# Gradio 2.x API (gradio.inputs, Interface(layout=...)), so stay below 3.0.
+gradio = "^2.9"
+Jinja2 = "^3.0.3"
+pytextrank = "^3.2.3"
+huggingface = "^0.0.1"
+transformers = {extras = ["pytorch"], version = "^4.17.0"}
+torch = "^1.11.0"
+[tool.poetry.dev-dependencies]
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"

src/text_rank_summarizer.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import spacy
+import pytextrank
+from math import sqrt
+from operator import itemgetter
+# Module-level pipeline shared by summarize(): loaded once at import time.
+nlp = spacy.load('en_core_web_sm')
+# Append the 'textrank' component; presumably made available by the
+# `import pytextrank` above, which is otherwise not referenced by name.
+nlp.add_pipe('textrank')
+def _phrase_vector(doc):
+    """Build normalised phrase weights and per-sentence phrase membership.
+
+    Returns a tuple ``(weights, bounds)``. ``weights`` holds the rank of every
+    phrase in ``doc._.phrases`` divided by the sum of all ranks. ``bounds`` has
+    one ``[start, end, phrase_ids]`` entry per sentence in ``doc.sents``, where
+    ``phrase_ids`` is the set of phrase indices that have a chunk lying fully
+    inside that sentence.
+    """
+    bounds = [[sent.start, sent.end, set()] for sent in doc.sents]
+    ranks = []
+    for index, phrase in enumerate(doc._.phrases):
+        ranks.append(phrase.rank)
+        for chunk in phrase.chunks:
+            # Credit the phrase to the first sentence that contains the chunk.
+            for start, end, members in bounds:
+                if chunk.start >= start and chunk.end <= end:
+                    members.add(index)
+                    break
+    total = sum(ranks)
+    return [rank / total for rank in ranks], bounds
+def _sent_rank(unit_vector, sent_bounds):
+    """Score each sentence by the phrase weight it is missing.
+
+    For every entry of ``sent_bounds`` (``[start, end, phrase_ids]``), take the
+    square root of the sum of squared ``unit_vector`` weights of the phrases
+    whose index is NOT in ``phrase_ids``. Returns a dict mapping the
+    sentence's position in ``sent_bounds`` to that value.
+    """
+    distances = {}
+    for index, (_, _, members) in enumerate(sent_bounds):
+        missing_sq = 0.0
+        for phrase_index, weight in enumerate(unit_vector):
+            if phrase_index in members:
+                continue
+            missing_sq += weight ** 2.0
+        distances[index] = sqrt(missing_sq)
+    return distances
+def _rank_to_summary(sent_rank, doc, summary_lines):
+    """Join the text of the top-ranked sentences into one summary string.
+
+    Args:
+        sent_rank: iterable of ``(sentence_index, score)`` pairs, already in
+            the order the sentences should appear in the summary.
+        doc: object whose ``sents`` yields sentences with a ``text`` attribute.
+        summary_lines: maximum number of sentences to emit. Numeric values are
+            truncated to an int and clamped at zero, so a float from a UI
+            widget (e.g. ``4.0`` or ``2.5``) works; ``None`` means no limit.
+
+    Returns:
+        The selected sentence texts joined by single spaces, or ``''`` when
+        nothing is selected.
+    """
+    sent_text = dict(enumerate(sent.text for sent in doc.sents))
+    # An equality test against a running counter never fires for 0, negative
+    # or fractional limits; slicing with a clamped int handles all of them.
+    limit = None if summary_lines is None else max(int(summary_lines), 0)
+    chosen = list(sent_rank)[:limit]
+    return ' '.join(sent_text[sent_id] for sent_id, _ in chosen)
+def summarize(text, summary_lines=4):
+    """Return an extractive summary of ``text``.
+
+    Runs ``text`` through the module-level ``nlp`` pipeline, scores every
+    sentence with ``_phrase_vector`` / ``_sent_rank``, sorts the sentences by
+    ascending score and hands them to ``_rank_to_summary``.
+
+    Args:
+        text: the document to summarise.
+        summary_lines: number of sentences requested. Defaults to 4 so the
+            function can also be called with the text alone, like the
+            one-argument ``summarize(text)`` of the transformer backend.
+
+    Returns:
+        The selected sentences joined into a single string.
+    """
+    doc = nlp(text)
+    phrase_weights, sent_bounds = _phrase_vector(doc)
+    distances = _sent_rank(phrase_weights, sent_bounds)
+    # Ascending sort: sentences with the smallest score come first.
+    ranked = sorted(distances.items(), key=itemgetter(1))
+    return _rank_to_summary(ranked, doc, summary_lines)

src/transformer_summarization.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from transformers import LongformerTokenizer, EncoderDecoderModel
+# Load model and tokenizer once at import time; summarize() below reuses them.
+# The encoder-decoder checkpoint and the tokenizer come from two different hub ids.
+model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
+tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+def summarize(text):
+    """Return an abstractive summary of ``text`` from the module-level model.
+
+    Args:
+        text: the document to summarise.
+
+    Returns:
+        The decoded summary with special tokens removed, or ``""`` when
+        ``text`` is empty or whitespace-only.
+    """
+    # Nothing to summarise: skip generation instead of decoding from an empty prompt.
+    if not text or not text.strip():
+        return ""
+    # truncation=True caps the input at the tokenizer's maximum length so an
+    # over-long document is cut off rather than overflowing the encoder.
+    input_ids = tokenizer(text, return_tensors="pt", truncation=True).input_ids
+    output_ids = model.generate(input_ids)
+    # Get the summary from the output tokens
+    return tokenizer.decode(output_ids[0], skip_special_tokens=True)