mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Remove BDA + TM
This commit is contained in:
@ -1,15 +0,0 @@
|
|||||||
{
|
|
||||||
"name": "Big Data Analytics and Text Mining",
|
|
||||||
"year": 2,
|
|
||||||
"semester": 1,
|
|
||||||
"pdfs": [
|
|
||||||
{
|
|
||||||
"name": "Text mining",
|
|
||||||
"path": "module1/tm.pdf"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Big data analytics",
|
|
||||||
"path": "module2/bda.pdf"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@ -1 +0,0 @@
|
|||||||
../../../ainotes.cls
|
|
||||||
@ -1,138 +0,0 @@
|
|||||||
\chapter{Automatic text summarization}
|
|
||||||
|
|
||||||
\begin{description}
|
|
||||||
\item[Extractive summarization] \marginnote{Extractive summarization}
|
|
||||||
Select fragments of text.
|
|
||||||
|
|
||||||
\item[Abstractive summarization] \marginnote{Abstractive summarization}
|
|
||||||
Rephrase the content of the text.
|
|
||||||
|
|
||||||
\item[Hybrid summarization] \marginnote{Hybrid summarization}
|
|
||||||
Apply an extractive method followed by an abstractive one.
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
|
|
||||||
\begin{description}
|
|
||||||
\item[Generic vs query-focused] \phantom{}
|
|
||||||
\begin{description}
|
|
||||||
\item[Generic]
|
|
||||||
Summary of the whole document.
|
|
||||||
\item[Query-focused]
|
|
||||||
Summary that replies to given questions.
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
\item[Technical vs lay] \phantom{}
|
|
||||||
\begin{description}
|
|
||||||
\item[Technical]
|
|
||||||
Summary using scientific language.
|
|
||||||
\item[Lay]
|
|
||||||
Summary using common language.
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
\item[Narrative vs bullet point] \phantom{}
|
|
||||||
\begin{description}
|
|
||||||
\item[Narrative]
|
|
||||||
Standard textual summary.
|
|
||||||
\item[Bullet point]
|
|
||||||
Set of key phrases.
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
\item[Single document vs multi document] \phantom{}
|
|
||||||
\begin{description}
|
|
||||||
\item[Single document]
|
|
||||||
Summary covering a single document.
|
|
||||||
\item[Multi document]
|
|
||||||
Summary covering multiple documents.
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
\item[Short document vs long document] \phantom{}
|
|
||||||
\begin{description}
|
|
||||||
\item[Short document]
|
|
||||||
Summary of a document with a few tokens.
|
|
||||||
\item[Long document]
|
|
||||||
Summary of a document with many tokens.
|
|
||||||
\end{description}
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\section{Metrics}
|
|
||||||
|
|
||||||
Summarization metrics can evaluate summaries at different levels:
|
|
||||||
\begin{descriptionlist}
|
|
||||||
\item[Syntactic]
|
|
||||||
Check word overlapping (e.g., ROUGE).
|
|
||||||
\item[Semantic]
|
|
||||||
Check semantic coverage (e.g., BERTScore).
|
|
||||||
\item[Factuality]
|
|
||||||
Check factual consistency with the source (e.g., BARTScore).
|
|
||||||
\item[Fluency]
|
|
||||||
Check fluency in terms of redundancy (e.g., unique N-gram ratio).
|
|
||||||
\item[Efficiency]
|
|
||||||
Measure trade-off between performance and costs (e.g., CARBURACY).
|
|
||||||
\end{descriptionlist}
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Recall-Oriented Understudy for Gisting Evaluation (ROUGE)}
|
|
||||||
|
|
||||||
\begin{description}
|
|
||||||
\item[ROUGE] \marginnote{ROUGE}
|
|
||||||
N-gram oriented metric that compares the generated summary and the ground truth.
|
|
||||||
|
|
||||||
\begin{description}
|
|
||||||
\item[ROUGE-1] Overlap of 1-grams.
|
|
||||||
\item[ROUGE-2] Overlap of 2-grams.
|
|
||||||
\item[ROUGE-L] Length of the longest common subsequence.
|
|
||||||
\end{description}
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
|
|
||||||
\begin{description}
|
|
||||||
\item[Precision]
|
|
||||||
\[ \texttt{ROUGE}_\texttt{precision} = \frac{\vert \text{overlaps} \vert}{\vert \text{generated summary} \vert} \]
|
|
||||||
|
|
||||||
\item[Recall]
|
|
||||||
\[ \texttt{ROUGE}_\texttt{recall} = \frac{\vert \text{overlaps} \vert}{\vert \text{ground truth} \vert} \]
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Limitations}
|
|
||||||
|
|
||||||
\begin{itemize}
|
|
||||||
\item ROUGE only evaluates on a syntactic level.
|
|
||||||
\item ROUGE-2 and ROUGE-L are sensitive to the position of words.
|
|
||||||
\end{itemize}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\section{State-of-the-art generative summarizers}
|
|
||||||
|
|
||||||
\subsection{BART}
|
|
||||||
|
|
||||||
\begin{itemize}
|
|
||||||
\item \marginnote{BART}
|
|
||||||
Encoder-decoder Transformer with an input size of 1024 tokens.
|
|
||||||
\item It is suited for short document summarization.
|
|
||||||
\item It is pre-trained using a denoising sequence-to-sequence approach.
|
|
||||||
\end{itemize}
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{Longformer encoder-decoder}
|
|
||||||
|
|
||||||
\begin{itemize}
|
|
||||||
\item \marginnote{Longformer encoder-decoder}
|
|
||||||
Encoder-decoder Transformer with an input size of 16k tokens.
|
|
||||||
\item It is suited for long document summarization.
|
|
||||||
\item It uses a linear encoder self-attention based on global and local attention that reduces the quadratic complexity of the standard attention mechanism.
|
|
||||||
\end{itemize}
|
|
||||||
|
|
||||||
|
|
||||||
\subsection{PRIMERA}
|
|
||||||
|
|
||||||
\begin{itemize}
|
|
||||||
\item \marginnote{PRIMERA}
|
|
||||||
Encoder-decoder Transformer based on Longformer with an input size of 4k tokens.
|
|
||||||
\item It is suited for long document summarization.
|
|
||||||
\item It has an ad-hoc pre-training for multi document summarization.
|
|
||||||
\end{itemize}
|
|
||||||
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
\documentclass[11pt]{ainotes}
|
|
||||||
|
|
||||||
\title{Big Data Analytics and Text Mining\\(Module 1)}
|
|
||||||
\date{2024 -- 2025}
|
|
||||||
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
|
|
||||||
\def\giturl{{PLACEHOLDER-GIT-URL}}
|
|
||||||
|
|
||||||
\begin{document}
|
|
||||||
|
|
||||||
\makenotesfront
|
|
||||||
\input{./sections/_summarization.tex}
|
|
||||||
|
|
||||||
\end{document}
|
|
||||||
@ -1 +0,0 @@
|
|||||||
../../../ainotes.cls
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
\documentclass[11pt]{ainotes}
|
|
||||||
|
|
||||||
\title{Big Data Analytics and Text Mining\\(Module 2)}
|
|
||||||
\date{2024 -- 2025}
|
|
||||||
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
|
|
||||||
\def\giturl{{PLACEHOLDER-GIT-URL}}
|
|
||||||
|
|
||||||
\begin{document}
|
|
||||||
|
|
||||||
\makenotesfront
|
|
||||||
|
|
||||||
\end{document}
|
|
||||||
Reference in New Issue
Block a user