From 5bc88a2ed7d9d369f9ccf45625d6d0c127611960 Mon Sep 17 00:00:00 2001 From: NotXia <35894453+NotXia@users.noreply.github.com> Date: Sat, 14 Dec 2024 21:54:26 +0100 Subject: [PATCH] Remove BDA + TM --- .../metadata.json | 15 -- .../module1/ainotes.cls | 1 - .../module1/sections/_summarization.tex | 138 ------------------ .../module1/tm.tex | 13 -- .../module2/ainotes.cls | 1 - .../module2/bda.tex | 12 -- 6 files changed, 180 deletions(-) delete mode 100644 src/year2/big-data-analytics-and-text-mining/metadata.json delete mode 120000 src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls delete mode 100644 src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex delete mode 100644 src/year2/big-data-analytics-and-text-mining/module1/tm.tex delete mode 120000 src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls delete mode 100644 src/year2/big-data-analytics-and-text-mining/module2/bda.tex diff --git a/src/year2/big-data-analytics-and-text-mining/metadata.json b/src/year2/big-data-analytics-and-text-mining/metadata.json deleted file mode 100644 index efda739..0000000 --- a/src/year2/big-data-analytics-and-text-mining/metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "Big Data Analytics and Text Mining", - "year": 2, - "semester": 1, - "pdfs": [ - { - "name": "Text mining", - "path": "module1/tm.pdf" - }, - { - "name": "Big data analytics", - "path": "module2/bda.pdf" - } - ] -} \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls b/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls deleted file mode 120000 index 4a953bf..0000000 --- a/src/year2/big-data-analytics-and-text-mining/module1/ainotes.cls +++ /dev/null @@ -1 +0,0 @@ -../../../ainotes.cls \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex b/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex deleted file mode 100644 index b08fec5..0000000 --- a/src/year2/big-data-analytics-and-text-mining/module1/sections/_summarization.tex +++ /dev/null @@ -1,138 +0,0 @@ -\chapter{Automatic text summarization} - -\begin{description} - \item[Extractive summarization] \marginnote{Extractive summarization} - Select fragments of text. - - \item[Abstractive summarization] \marginnote{Abstractive summarization} - Rephrase the content of the text. - - \item[Hybrid summarization] \marginnote{Hybrid summarization} - Apply an extractive method followed by an abstractive one. -\end{description} - - -\begin{description} - \item[Generic vs query-focused] \phantom{} - \begin{description} - \item[Generic] - Summary of the whole document. - \item[Query-focused] - Summary that replies to given questions - \end{description} - - \item[Technical vs lay] \phantom{} - \begin{description} - \item[Technical] - Summary using scientific language. - \item[Lay] - Summary using common language. - \end{description} - - \item[Narrative vs bullet point] \phantom{} - \begin{description} - \item[Narrative] - Standard textual summary. - \item[Bullet point] - Set of key phrases. - \end{description} - - \item[Single document vs multi document] \phantom{} - \begin{description} - \item[Single document] - Summary covering a single document. - \item[Multi document] - Summary covering multiple documents. - \end{description} - - \item[Short document vs long document] \phantom{} - \begin{description} - \item[Short document] - Summary of a document with a few tokens. - \item[Long document] - Summary of a document with many tokens. - \end{description} -\end{description} - - - -\section{Metrics} - -Summarization metrics can evaluate different levels: -\begin{descriptionlist} - \item[Syntactic] - Check word overlapping (e.g., ROUGE). - \item[Semantic] - Check semantic coverage (e.g., BERTScore). - \item[Factuality] - Check factuality to the source (e.g., BARTScore). - \item[Fluency] - Check for redundancies (e.g., unique N-gram ratio). - \item[Efficiency] - Measure trade-off between performance and costs (e.g., CARBURACY). -\end{descriptionlist} - - -\subsection{Recall-Oriented Understudy for Gisting Evaluation (ROUGE)} - -\begin{description} - \item[ROUGE] \marginnote{ROUGE} - N-gram oriented metric that compares the generated summary and the ground truth. - - \begin{description} - \item[ROUGE-1] Overlap of 1-grams. - \item[ROUGE-2] Overlap of 2-grams. - \item[ROUGE-L] Length of the common longest subsequence. - \end{description} -\end{description} - - -\begin{description} - \item[Precision] - \[ \texttt{ROUGE}_\texttt{precision} = \frac{\vert \text{overlaps} \vert}{\vert \text{generated summary} \vert} \] - - \item[Recall] - \[ \texttt{ROUGE}_\texttt{recall} = \frac{\vert \text{overlaps} \vert}{\vert \text{ground truth} \vert} \] -\end{description} - - -\subsection{Limitations} - -\begin{itemize} - \item ROUGE only evaluates on a syntactic level. - \item ROUGE-2 and ROUGE-L are sensitive to the position of words. -\end{itemize} - - - -\section{State-of-the-art generative summarizers} - -\subsection{BART} - -\begin{itemize} - \item \marginnote{BART} - Encoder-decoder Transformer with an input size of 1024 tokens. - \item It is suited for short document summarization. - \item It is pre-trained using a denoising sequence-to-sequence approach. -\end{itemize} - - -\subsection{Longformer encoder-decoder} - -\begin{itemize} - \item \marginnote{Longformer encoder-decoder} - Encoder-decoder Transformer with an input size of 16k tokens. - \item It is suited for long document summarization. - \item It uses a linear encoder self-attention based on global and local attention that reduces the quadratic complexity of the standard attention mechanism. -\end{itemize} - - -\subsection{PRIMERA} - -\begin{itemize} - \item \marginnote{PRIMERA} - Encoder-decoder Transformer based on Longformer with an input size of 4K tokens. - \item It is suited for long document summarization. - \item It has an ad-hoc pre-training for multi document summarization. -\end{itemize} - diff --git a/src/year2/big-data-analytics-and-text-mining/module1/tm.tex b/src/year2/big-data-analytics-and-text-mining/module1/tm.tex deleted file mode 100644 index 6972836..0000000 --- a/src/year2/big-data-analytics-and-text-mining/module1/tm.tex +++ /dev/null @@ -1,13 +0,0 @@ -\documentclass[11pt]{ainotes} - -\title{Big Data Analytics and Text Mining\\(Module 1)} -\date{2024 -- 2025} -\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}} -\def\giturl{{PLACEHOLDER-GIT-URL}} - -\begin{document} - - \makenotesfront - \input{./sections/_summarization.tex} - -\end{document} \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls b/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls deleted file mode 120000 index 4a953bf..0000000 --- a/src/year2/big-data-analytics-and-text-mining/module2/ainotes.cls +++ /dev/null @@ -1 +0,0 @@ -../../../ainotes.cls \ No newline at end of file diff --git a/src/year2/big-data-analytics-and-text-mining/module2/bda.tex b/src/year2/big-data-analytics-and-text-mining/module2/bda.tex deleted file mode 100644 index 5d853d2..0000000 --- a/src/year2/big-data-analytics-and-text-mining/module2/bda.tex +++ /dev/null @@ -1,12 +0,0 @@ -\documentclass[11pt]{ainotes} - -\title{Big Data Analytics and Text Mining\\(Module 2)} -\date{2024 -- 2025} -\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}} -\def\giturl{{PLACEHOLDER-GIT-URL}} - -\begin{document} - - \makenotesfront - -\end{document} \ No newline at end of file