diff --git a/src/year2/big-data-analytics-and-text-mining/metadata.json b/src/year2/big-data-analytics-and-text-mining/metadata.json new file mode 100644 index 0000000..efda739 --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "Big Data Analytics and Text Mining", + "year": 2, + "semester": 1, + "pdfs": [ + { + "name": "Text mining", + "path": "module1/tm.pdf" + }, + { + "name": "Big data analytics", + "path": "module2/bda.pdf" + } + ] +} \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls b/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls new file mode 120000 index 0000000..4a953bf --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls @@ -0,0 +1 @@ +../../../ainotes.cls \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex b/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex new file mode 100644 index 0000000..b08fec5 --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex @@ -0,0 +1,138 @@ +\chapter{Automatic text summarization} + +\begin{description} + \item[Extractive summarization] \marginnote{Extractive summarization} + Select fragments of text. + + \item[Abstractive summarization] \marginnote{Abstractive summarization} + Rephrase the content of the text. + + \item[Hybrid summarization] \marginnote{Hybrid summarization} + Apply an extractive method followed by an abstractive one. +\end{description} + + +\begin{description} + \item[Generic vs query-focused] \phantom{} + \begin{description} + \item[Generic] + Summary of the whole document. + \item[Query-focused] + Summary that replies to given questions. + \end{description} + + \item[Technical vs lay] \phantom{} + \begin{description} + \item[Technical] + Summary using scientific language. 
+ \item[Lay] + Summary using common language. + \end{description} + + \item[Narrative vs bullet point] \phantom{} + \begin{description} + \item[Narrative] + Standard textual summary. + \item[Bullet point] + Set of key phrases. + \end{description} + + \item[Single document vs multi document] \phantom{} + \begin{description} + \item[Single document] + Summary covering a single document. + \item[Multi document] + Summary covering multiple documents. + \end{description} + + \item[Short document vs long document] \phantom{} + \begin{description} + \item[Short document] + Summary of a document with a few tokens. + \item[Long document] + Summary of a document with many tokens. + \end{description} +\end{description} + + + +\section{Metrics} + +Summarization metrics can evaluate different levels: +\begin{descriptionlist} + \item[Syntactic] + Check word overlapping (e.g., ROUGE). + \item[Semantic] + Check semantic coverage (e.g., BERTScore). + \item[Factuality] + Check factuality to the source (e.g., BARTScore). + \item[Fluency] + Check for redundancies (e.g., unique N-gram ratio). + \item[Efficiency] + Measure trade-off between performance and costs (e.g., CARBURACY). +\end{descriptionlist} + + +\subsection{Recall-Oriented Understudy for Gisting Evaluation (ROUGE)} + +\begin{description} + \item[ROUGE] \marginnote{ROUGE} + N-gram oriented metric that compares the generated summary and the ground truth. + + \begin{description} + \item[ROUGE-1] Overlap of 1-grams. + \item[ROUGE-2] Overlap of 2-grams. + \item[ROUGE-L] Length of the longest common subsequence. 
+ \end{description} +\end{description} + + +\begin{description} + \item[Precision] + \[ \texttt{ROUGE}_{\texttt{precision}} = \frac{\vert \text{overlaps} \vert}{\vert \text{generated summary} \vert} \] + + \item[Recall] + \[ \texttt{ROUGE}_{\texttt{recall}} = \frac{\vert \text{overlaps} \vert}{\vert \text{ground truth} \vert} \] +\end{description} + + +\subsection{Limitations} + +\begin{itemize} + \item ROUGE only evaluates on a syntactic level. + \item ROUGE-2 and ROUGE-L are sensitive to the position of words. +\end{itemize} + + + +\section{State-of-the-art generative summarizers} + +\subsection{BART} + +\begin{itemize} + \item \marginnote{BART} + Encoder-decoder Transformer with an input size of 1024 tokens. + \item It is suited for short document summarization. + \item It is pre-trained using a denoising sequence-to-sequence approach. +\end{itemize} + + +\subsection{Longformer encoder-decoder} + +\begin{itemize} + \item \marginnote{Longformer encoder-decoder} + Encoder-decoder Transformer with an input size of 16K tokens. + \item It is suited for long document summarization. + \item It uses a linear encoder self-attention based on global and local attention that reduces the quadratic complexity of the standard attention mechanism. +\end{itemize} + + +\subsection{PRIMERA} + +\begin{itemize} + \item \marginnote{PRIMERA} + Encoder-decoder Transformer based on Longformer with an input size of 4K tokens. + \item It is suited for long document summarization. + \item It has an ad-hoc pre-training for multi document summarization. 
+\end{itemize} + diff --git a/src/year2/big-data-analytics-and-text-mining/module1/tm.tex b/src/year2/big-data-analytics-and-text-mining/module1/tm.tex new file mode 100644 index 0000000..6972836 --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/module1/tm.tex @@ -0,0 +1,13 @@ +\documentclass[11pt]{ainotes} + +\title{Big Data Analytics and Text Mining\\(Module 1)} +\date{2024 -- 2025} +\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}} +\def\giturl{{PLACEHOLDER-GIT-URL}} + +\begin{document} + + \makenotesfront + \input{./sections/_summarization.tex} + +\end{document} \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls b/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls new file mode 120000 index 0000000..4a953bf --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls @@ -0,0 +1 @@ +../../../ainotes.cls \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module2/bda.tex b/src/year2/big-data-analytics-and-text-mining/module2/bda.tex new file mode 100644 index 0000000..5d853d2 --- /dev/null +++ b/src/year2/big-data-analytics-and-text-mining/module2/bda.tex @@ -0,0 +1,12 @@ +\documentclass[11pt]{ainotes} + +\title{Big Data Analytics and Text Mining\\(Module 2)} +\date{2024 -- 2025} +\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}} +\def\giturl{{PLACEHOLDER-GIT-URL}} + +\begin{document} + + \makenotesfront + +\end{document} \ No newline at end of file