From cb3e163b97b772f9c76a03b6949bad686b1fe67f Mon Sep 17 00:00:00 2001
From: jannisp <jannisp@student.ethz.ch>
Date: Wed, 11 Aug 2021 17:57:07 +0200
Subject: [PATCH] Extend autocorrelation

---
 main.tex | 135 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 122 insertions(+), 13 deletions(-)

diff --git a/main.tex b/main.tex
index a4d8594..d80367a 100644
--- a/main.tex
+++ b/main.tex
@@ -179,8 +179,8 @@ mathematically formulated by strict stationarity.
     $Cov[X_t,X_{t+h}] = \gamma_h$ & autocovariance depends only on lag $h$ \\   
 \end{tabular}
 
-\subsubsection{Weak}
-It is impossible to "prove" the theoretical concept of stationarity from data. We can only search for evidence in favor or against it. \\
+\subsubsection{Weak} \label{weak-stationarity}
+ It is impossible to "prove" the theoretical concept of stationarity from data. We can only search for evidence in favor or against it. \\
 \vspace{0.1cm}
 However, with strict stationarity, even finding evidence only is too difficult. We thus resort to the concept of weak stationarity.
 
@@ -393,7 +393,7 @@ fit <- gam(log(maine) ~ s(tnum) + mm)
 \end{lstlisting}
 
 \section{Autocorrelation}
-For most of the rest of this course, we will deal with (weakly) stationary time series. See \ref{}
+For most of the rest of this course, we will deal with (weakly) stationary time series. See Section~\ref{weak-stationarity}. \\
 \vspace{.2cm}
 Definition of autocorrelation at lag $k$
 $$Cor(X_{t+k},X_t) = \frac{Cov(X_{k+t},X_t)}{\sqrt{Var(X_{k+t})\cdot Var(X_t)}} = \rho(k)$$
@@ -410,25 +410,134 @@ We assume $\rho(k) = 0.7$
     \item From this we can also conclude that any $\rho(k) < 0.4$ is not a strong association, i.e. has a small effect on the next observation only.
 \end{itemize}
 
+\subsection{Lagged scatterplot approach}
+Create a plot of $(x_t, x_{t+k}) \, \forall \, t = 1,\dots,n-k$, compute the canonical Pearson correlation coefficient of these pairs and use it as an estimate of the autocorrelation, denoted $\tilde{\rho}(k)$.
+
+\begin{lstlisting}[language=R]
+lag.plot(wave, do.lines=FALSE, pch=20)
+\end{lstlisting}
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=.25\textwidth]{lagged-scatterplot.png}
+    \caption{Lagged scatterplot example for $k=1$}
+    \label{fig:lagged-scatterplot}
+\end{figure}
+
+\subsection{Plug-in estimation}
+Plug-in estimation relies on the canonical covariance estimator:
+$$\hat{\rho}(k) = \frac{\widehat{Cov}(X_t,X_{t+k})}{\widehat{Var}(X_t)}$$
+Plug-in estimates are biased, i.e. shrunken towards zero for large lags $k$. Nevertheless, they are generally more reliable and precise.
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=.25\textwidth]{lagged-scatterplot-vs-plug-in.png}
+    \caption{Lagged scatterplot estimation vs. plug-in estimation}
+    \label{fig:lagged-scatterplot-vs-plug-in}
+\end{figure}
+
+\subsection{Important points on ACF estimation}
+\begin{itemize}
+    \item Correlations measure linear association and usually fail if there are non-linear associations between the variables.
+    \item The bigger the lag $k$ for which $\rho(k)$ is estimated, the fewer data pairs remain. Hence the higher the lag, the bigger the variability in $\hat{\rho}(k)$.
+    \item To avoid spurious autocorrelation, the plug-in approach shrinks $\hat{\rho}(k)$ for large $k$ towards zero. This creates a bias, but pays off in terms of mean squared error.
+    \item Autocorrelations are only computed and inspected for lags up to $10 \log_{10}(n)$, where they have less bias/variance.
+\end{itemize}
+
+\subsection{Correlogram}
+\begin{lstlisting}[language=R]
+acf(wave, ylim=c(-1,1))
+\end{lstlisting}
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=.25\textwidth]{correlogram.png}
+    \caption{Example correlogram}
+    \label{fig:correlogram}
+\end{figure}
+
+
+\subsubsection{Confidence Bands}
+Even for an i.i.d. series $X_t$ without autocorrelation, i.e. $\rho(k) = 0 \, \forall \, k$, the estimates will be different from zero: $\hat{\rho}(k) \neq 0$ \\
+\textbf{Question}: Which $\hat{\rho}(k)$ are significantly different from zero?
+
+$$\hat{\rho}(k) \sim N(0,1/n), \; \mathrm{for \, large} \, n$$
+\begin{itemize}
+    \item Under the null hypothesis of an i.i.d. series, a 95\% acceptance region for the null is given by the interval $\pm 1.96 / \sqrt{n}$
+    \item  For any stationary series, $\hat{\rho}(k)$ within the confidence bands are considered to be different from 0 only by chance, while those outside are considered to be truly different from zero.
+\end{itemize}
+\textbf{Type I Errors} \\
+For i.i.d. series, we need to expect 5\% of type I errors, i.e. $\hat{\rho}(k)$ that go beyond the confidence bands by chance. \\
+\textbf{Non i.i.d. series} \\
+The confidence bands are asymptotic for i.i.d. series. Real finite length non-i.i.d. series have different (unknown) properties.
+
+\subsection{Ljung-Box test}
+The Ljung-Box approach tests the null hypothesis that a number of autocorrelation coefficients are simultaneously equal to zero. \\
+Thus, it tests for significant autocorrelation in a series. The test statistic is:
+
+$$Q(h) = n(n+2)\sum_{k=1}^h \frac{\hat{\rho}(k)^2}{n-k} \sim \chi_h^2$$
+
+\begin{lstlisting}[language=R]
+Box.test(wave, lag=10, type="Ljung-Box")
+\end{lstlisting}
+
+\subsection{ACF and outliers}
+The estimates $\hat{\rho}(k)$ are sensitive to outliers. They can be diagnosed using the lagged scatterplot, where every single outlier appears twice. \\
+\vspace{.2cm}
+\textbf{Some basic strategies for dealing with outliers}
+\begin{itemize}
+    \item if it is a bad data point: delete the observation
+    \item most (if not all) R functions can deal with missing data
+    \item if complete data are required, replace missing values with
+    \begin{itemize}
+        \item global mean of the series
+        \item local mean of the series, e.g. $\pm 3$ observations
+        \item fit a time series model and predict the missing value
+    \end{itemize}
+\end{itemize}
+
+\subsection{Properties of estimated ACF}
+\begin{itemize}
+    \item Appearance of the series $\Rightarrow$ Appearance of the ACF \\ Appearance of the series $\nLeftarrow$ Appearance of the ACF
+    \item The compensation issue: \\ $\sum_{k=1}^{n-1}\hat{\rho}(k) = -1/2$ \\ All estimable autocorrelation coefficients sum up to -1/2
+    \item  For large lags $k$, there are only a few data pairs for estimating $\rho(k)$. This leads to higher variability and hence the plug-in estimates are shrunken towards zero.
+\end{itemize}
+
+\subsection{Application: Variance of the arithmetic mean}
+We need to estimate the mean of a realized/observed time series. We would like to attach a standard error.
+\begin{itemize}
+    \item If we estimate the mean of a time series without taking into account the dependency, the standard error will be flawed.
+    \item This leads to misinterpretation of tests and confidence intervals and therefore needs to be corrected.
+    \item The standard error of the mean can be either over- or underestimated. This depends on the ACF of the series.
+\end{itemize}
+
+\subsubsection{Confidence interval}
+For a 95\% CI:
+$$\hat{\mu} \pm 1.96 \sqrt{\frac{\gamma(0)}{n^2} \bigg(n + 2 \cdot \sum_{k=1}^{10 \log_{10}(n)}(n-k)\rho(k) \bigg)}$$
+
+In R we can use
+\begin{lstlisting}[language=R]
+n <- length(b)
+var.ts <- 1/n^2*acf(b,lag=0,type="cov")$acf[1]*(n+2*sum(((n-1):(n-10))*acf(b,10)$acf[-1]))
+mean(b) + c(-1.96,1.96)*sqrt(var.ts)
+\end{lstlisting}
+
 \scriptsize
 
-\section*{Copyleft}
-
-\doclicenseImage \\
-Dieses Dokument ist unter (CC BY-SA 3.0) freigegeben \\
+\section*{Copyright}
+Nearly everything is copied from the slides or the script. Copyright belongs to M. Dettling. \\
 \faGlobeEurope \kern 1em \url{https://n.ethz.ch/~jannisp/ats-zf} \\
 \faGit \kern 0.88em \url{https://git.thisfro.ch/thisfro/ats-zf} \\
 Jannis Portmann, FS21
 
-\section*{Referenzen}
+\section*{References}
 \begin{enumerate}
-    \item Skript
+    \item ATSA\_Script\_v219219.docx, M. Dettling
+    \item ATSA\_Slides\_v219219.pptx, M. Dettling
 \end{enumerate}
 
-\section*{Bildquellen}
-\begin{itemize}
-    \item Bild
-\end{itemize}
+\section*{Image sources}
+All pictures are taken from the slides or the script mentioned above.
 
 \end{multicols*}