################################################################################
# Time-series introduction for ecologists
# 14.00 Italian Time: 1 December 2021 + 24 January 2022
# Ulrike Obertegger, ulrike.obertegger@fmach.it
################################################################################

# packages (n=21) to install to run the complete script in R
# tidyverse
# lubridate
# forecast
# fpp2
# urca
# tseries
# nlme
# rcompanion
# openair
# Kendall
# Metrics
# strucchange
# trend
# bfast
# dynlm
# mgcv
# ggplot2
# dtwclust
# vegan
# lmtest
# wql

################## let's talk about time series data ###########################

# What is a Time Series?
# Any metric that is measured over time at regular or irregular intervals.

# impute missing data (e.g., with the mice package); a common threshold is 5% missing
# values of the total for large datasets
# missForest is an alternative

# most statistical tests rely on
# - independence
# - homoscedasticity (the variability of the random disturbance is the same across elements of the vector)
# - normality
# Heteroscedasticity in a regression model refers to the unequal scatter of residuals
# at different levels of the response variable.

# How to create a Time Series in R?
rm(list=ls())
gc()
data("AirPassengers")
str(AirPassengers)
library(tidyverse)
glimpse(AirPassengers)
par(mfrow=c(2,2))
plot(ts(AirPassengers, frequency = 4, start = c(1959, 2)))  # frequency 4 => quarterly data
plot(ts(AirPassengers, frequency = 12, start = 1990))       # frequency 12 => monthly data
plot(ts(AirPassengers, start=1877, end=2020, frequency=1))  # yearly data (144 observations)
library(lubridate)
# weekly data
plot(ts(AirPassengers, freq=365.25/7, start=decimal_date(ymd("2006-12-27"))))

# What is Autocorrelation and Partial Autocorrelation?
# Autocorrelation is the correlation of a time series with lags of itself.
# It shows if the previous states (lagged observations) of the time series have an
# influence on the current state.
# serial correlation = autocorrelation
par(mfrow=c(1,1))
acf(AirPassengers)
ACF <- acf(AirPassengers)
ACF$acf
# threshold for statistical significance
N=144
1.96/sqrt(N)
1.96/sqrt(N-20) # lag10
abline(h=1.96/sqrt(N), col="green")
abline(h=1.96/sqrt(N-20), col="blue")
pacf(AirPassengers)
# Partial autocorrelation is the correlation of the time series with a lag of itself,
# with the linear dependence of all the lags between them removed
# (i.e., controlling for the other lags).

## cross-correlation
# Cross-correlation measures the degree of resemblance between a time series and a
# lagged version of another time series, e.g., what a company spends on marketing
# and the revenue gained in that period.
Spend <- c(5, 3, 6, 5, 8, 9, 10, 17, 12, 11, 10, 9)
Income <- c(25, 29, 22, 34, 22, 28, 29, 31, 34, 45, 45, 40)
# The lag k value returned by ccf(x, y) estimates the correlation between x[t+k] and y[t].
ccf(Spend, Income)
print(ccf(Spend, Income))

# The Granger causality test is used to examine whether one time series may be used
# to forecast another.
# order: the number of lags to use in the first time series.
library(lmtest)
grangertest(Spend, Income, order = 2)
# Even though the null hypothesis of the test was not rejected, it is possible that
# reverse causation is occurring.
grangertest(Income, Spend, order = 2)
acf(Spend)
acf(Income)
# see https://stats.stackexchange.com/questions/271054/is-there-any-relationship-between-granger-causality-and-cross-correlation-diagra
# In the Granger test, we factor out the autocorrelation coefficients,
# and hence we only focus our attention on the cross-correlation coefficients.
# Therefore, we may find significance in the cross-correlation diagram but not
# in the Granger test, because the cross-correlation does not factor out the autocorrelation.

##### * ##### * ##### * ##### * ##### * ##### * ##### * ##### * ##### * ##### * #####

# Each data point (Yt) at time t in a time series can be expressed as either a sum or
# a product of 3 components, namely Seasonality (St), Trend (Tt) and Error (et)
# (a.k.a. white noise).
# For an additive time series:      Y_t = S_t + T_t + e_t
# For a multiplicative time series: Y_t = S_t x T_t x e_t
# A multiplicative time series can often be converted to additive by taking a log of the
# time series:
# additiveTS <- log(multiplicativeTS)
par(mfrow=c(1,2))
plot(AirPassengers)
additiveTS <- log10(AirPassengers)
plot(additiveTS)
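# As a quick illustration of the S_t / T_t / e_t components above, here is a minimal
# sketch using base R's decompose() (stl() would be an alternative); the AirPassengers
# variance grows with the level, so a multiplicative decomposition is appropriate:
dec <- decompose(AirPassengers, type = "multiplicative")
plot(dec)          # observed, trend, seasonal and random components in one figure
head(dec$seasonal) # the estimated seasonal component S_t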
# What is a Stationary Time Series?
# A time series is said to be stationary when:
# - the mean value of the time series is constant over time, which implies the trend
#   component is nullified;
# - the variance does not increase over time;
# - the seasonality effect is minimal.
# Time series that show no autocorrelation are called white noise.

# The ACF is commonly used to determine whether a time series is stationary or not:
# for a stationary time series the autocorrelation falls to zero fairly quickly,
# but for a non-stationary series it drops gradually.
# Thus, time series with trends, or with seasonality, are not stationary:
# the trend and seasonality will affect the value of the time series at different times.

# Concept of stationarity and the technique of differencing time series.
library(forecast)
# Transformations such as logarithms can help to stabilise the variance of a time series.
# Differencing can help stabilise the mean of a time series by removing changes in the level
# of a time series, and therefore eliminating (or reducing) trend and seasonality.
library(fpp2)
plot(goog200)
acf(goog200)
# compute the differences between consecutive observations
plot(diff(goog200))
acf(diff(goog200))
# another option = detrending
Box.test(goog200, lag=10, type="Ljung-Box")
Box.test(diff(goog200), lag=10, type="Ljung-Box")

# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test: the null hypothesis is that the data
# are stationary, and we look for evidence that the null hypothesis is false.
# ur.kpss() function from the urca package.
library(urca)
plot(goog)
goog %>% ur.kpss() %>% summary()
# The test statistic is much bigger than the 1% critical value,
# indicating that the null hypothesis is rejected.
goog %>% diff() %>% ur.kpss() %>% summary()
plot(diff(goog))

# While exponential smoothing models are based on a description of the trend and
# seasonality in the data, ARIMA models aim to describe the autocorrelations in the data.
# ARIMA = autoregressive integrated moving average
# AR: in an autoregression model, we forecast the variable of interest using a
#     linear combination of the p past values of the variable -> AR(p)
# MA: a moving average model uses past forecast errors in a regression-like model
#     -> MA(q), a moving average model of order q
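# A quick sketch (simulated data, not course data) showing the classic ACF/PACF
# signatures used to pick p and q:
set.seed(1)
ar1 <- arima.sim(model = list(ar = 0.7), n = 200) # AR(1): ACF decays gradually, PACF cuts off at lag 1
ma1 <- arima.sim(model = list(ma = 0.7), n = 200) # MA(1): ACF cuts off at lag 1, PACF decays gradually
par(mfrow = c(2, 2))
acf(ar1); pacf(ar1)
acf(ma1); pacf(ma1)
par(mfrow = c(1, 1))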
# ARIMA(p,d,q)
autoplot(uschange[,"Consumption"]) +
  xlab("Year") + ylab("Quarterly percentage change")
par(mfrow=c(1,1))
acf(uschange[,"Consumption"])
pacf(uschange[,"Consumption"])
auto.arima(uschange[,"Consumption"], seasonal=FALSE)

# end of the introduction
# have a look at some data
################################################################################
setwd("/Users/uliobertegger/Documents/Eigene Dateien/nach 2010/Kongresse-Tagungen/2021/SPF WP4_December 2021")
dat<-read.csv("dat.csv")
head(dat)
par(mfrow=c(1,1))
plot(dat$x, type="o")
library(tseries)
kpss.test(dat$x) # Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
acf(dat$x)
pacf(dat$x)
data <- ts(dat$x, frequency=12, start=c(2002,2))
plot(data)
(t<-time(data))
# (t<-seq(1,length(dat$x), by=1))
mod.0 <- lm(dat$x~t) # naive approach
summary(mod.0)
abline(mod.0)
# good statistical practice: check the residuals
acf(residuals(mod.0))
pacf(residuals(mod.0))
plot(mod.0) # homogeneity
plot(residuals(mod.0))
abline(h=0)
plot(y=residuals(mod.0), x=t)
abline(h=0)

library(nlme)
dati <- dat$x
# corARMA
# gls has "REML" as default
mod0 <- gls(dati~t)
mod1 <- gls(dati~t, correlation=corAR1(form=~1), method="REML")
anova(mod0, mod1)
resid <- residuals(mod1, type="normalized") # do this
acf(resid)
# when working on the correlation structure -> method = "REML"
# when working on the fixed part -> method = "ML"
resid.n <- residuals(mod1) # do not forget to put type="normalized"
acf(resid.n)
summary(mod1)
library(rcompanion)
nagelkerke(mod1)
abline(mod1, col="red", lwd=2)
predict(mod1)
(11.031234-6.563517)/90    # delta
(11.031234-6.563517)/90*12 # monthly data

#### ARIMA
diff2 <- diff(dati)
kpss.test(diff2)
library(forecast)
(fit2<-auto.arima(dati, ic="aic"))
(fit2<-Arima(dati, order=c(2,1,2), include.drift=TRUE))
resi2<-fit2$residuals
acf(resi2)
length(dati)
# instead of p-values, have a look at the confidence interval
# SD = SE*sqrt(N)
# CI = mean +/- 1.96*SD
-0.0399 + c(-1.96, 1.96)*0.0149/sqrt(90) # CI for the estimate of the drift (= trend)
str(fit2)
fit2$fitted
(10.230723-6.726570)/90    # delta
(10.230723-6.726570)/90*12
# compare ARIMA with gls
# [1] 0.4672204 vs 0.5956956
# how to compare the models? -> RMSE
plot(dati)
points(y=fit2$fitted, x=time(dati), col="red", pch=16)

########################### the classical approach #############################
# water level of the Nile
data("Nile")
Nile
plot(Nile)
acf(Nile)
pacf(Nile)
# the classic approach
library(Kendall)
MannKendall(Nile)
# addition :o)
library(wql)
tsp(Nile) # an annual time series
mannKen(Nile, plot=TRUE)
library(openair)
# there is a trend; we can use the non-parametric Theil-Sen estimator to assess the trend
(Time<-seq(as.POSIXct("1871-01-01 00:00:00", tz="UTC"),
           as.POSIXct("1970-01-01 00:00:00", tz="UTC"), by="year"))
# Note: In the case of the intercept, it is assumed the y-axis crosses the x-axis on 1/1/1970.
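# A small illustration of that intercept convention (my sketch, not from the openair docs):
# with the slope in units per year and the intercept anchored at 1/1/1970, the fitted value
# for year y is intercept + slope * (y - 1970); for the Nile record the offsets are negative.
years  <- 1871:1970
offset <- years - 1970 # -99 ... 0, i.e. years before the 1/1/1970 anchor
# fitted values would then be: intercept + slope * offset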
input <- as.numeric(Nile)
data <- data.frame(date=Time, water.level=input)
head(data)
glimpse(data)
plot(data$date, data$water.level, type="o")
st <- TheilSen(data, pollutant = "water.level")
st
st$data$res2 # output of the plot
st$data$main.data$conc
st$plot
st$data[[2]]
# inspect the ACF of the residuals (residuals = observed - predicted)
(slope <- st$data$res2$slope[1])
(intercept <- st$data$res2$intercept[1])
obs <- data$water.level
(pred.st <- intercept+slope*seq(1,100,1))
(pred.st <- as.numeric(intercept+slope*seq(1,100,1) + 266))
resi <- obs - pred.st
acf(resi)
# RMSE by hand
a <- (obs-pred.st) # residuals
b <- a^2           # squared residuals
c <- sum(b)        # sum of squares
d <- c/100         # mean squared error (n = 100)
sqrt(d)            # root mean squared error
library(Metrics)
Metrics::rmse(obs, pred.st)

## Nile data with one breakpoint: the annual flows drop in 1898
## because the first Aswan dam was built
## compute breakpoints
library(strucchange)
# this model says that there is just an intercept,
# and we are looking for a change in the intercept
(bp.nile <- breakpoints(Nile ~ 1))
## fit and visualize segmented and unsegmented model
plot(Nile)
fm0 <- lm(Nile ~ 1)
fm1 <- lm(Nile ~ breakfactor(bp.nile, breaks = 1))
lines(y=fitted(fm0), x=seq(1871,1970,1), col = 3)
lines(fitted(fm1), x=seq(1871,1970,1), col = 4, lwd=4)
lines(bp.nile, breaks = 1)
summary(fm1)
acf(residuals(fm1))
points(y=c(pred.st), x=time(Nile), type="l", lwd=3)
# finally, I solved the problem: the intercept is correct now
# compare the two competing models by the RMSE
### RMSE
part1 <- rep(1097.75, 28)
part2 <- rep((1097.75-247.78), 100-28)
pred.cp<-c(part1, part2)
Metrics::rmse(obs, pred.cp)
### not considering a change point can let you find a trend that is not there
# Flaim et al. 2020 Water Resources Research

##### change in the slope
# Computation of breakpoints in regression relationships. Given a number of breaks,
# the function computes the optimal breakpoints.
# The BIC would choose 0 breakpoints although the RE and supF tests
# clearly reject the hypothesis of structural stability. Bai &
# Perron (2003) report that the BIC has problems in dynamic regressions.
# Due to the shape of the RE process of the F statistics, choose two
# breakpoints and fit the corresponding models.
# have a look at
# https://lindeloev.github.io/mcp/articles/packages.html#a-simple-dataset-to-compare-packages-1

################################################################################
rm(list=ls(all=TRUE))
data("nottem")
nottem <- (nottem-32)/1.8 # from Fahrenheit to Celsius
# 20 years of monthly temperature measurements at Nottingham Castle (ts object)
# (Anderson, O. D. 1976).
par(mfrow=c(1,1))
plot(nottem)
temp <- nottem
class(temp)
length(temp)

# the classic
library(trend)
# seasonal Kendall test for trend
# R. Hirsch, J. Slack, R. Smith (1982),
# Techniques of Trend Analysis for Monthly Water Quality Data, Water Resources Research 18, 107-121.
# The SMK test provides a single summary statistic for the entire record; there may be
# instances where some months exhibit a strong upward trend and others a strong downward
# trend, so that the opposing trends cancel out and the SMK test indicates no trend overall.
# A per-month test (sketched below) can reveal such cases.
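# A small sketch (my addition) of a per-month Mann-Kendall check: reshape the monthly
# series into a year x month matrix and test each month separately.
library(Kendall)
monthly <- matrix(temp, ncol = 12, byrow = TRUE)          # 20 years x 12 months (Jan..Dec)
apply(monthly, 2, function(x) Kendall::MannKendall(x)$sl) # two-sided p-value per month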
(res <- smk.test(temp)) ## print method
## summary method
summary(res)
sum(res$Sg)

###################################################
library(bfast)
# Break Detection in the Seasonal and Trend Component of a Univariate Time Series
fit<-bfast(temp,h=0.15,season="harmonic",max.iter=10)
plot(fit)
fit
str(fit)
plot(fit$output[[1]]$Tt) # trend component
plot(fit$output[[1]]$St) # seasonal component

############## * modeling time series with cosine trends * ####################
# plot the raw data
plot(temp, ylab="mean monthly air temperature")
# periodogram
# spec.pgram(temp, spans=c(5,5))

## linear regression with trend and harmonic seasonality (of order 2)
library("dynlm")
dm0 <- dynlm(temp ~ trend(temp) + harmon(temp, order = 2))
summary(dm0)
acf(residuals(dm0))
# the seasonal component is a cyclical pattern of annually
# recurring deviations from the long-term mean
# trend is significant, annual sine wave highly significant (cos1/sin1),
# biannual sine wave also significant (cos2/sin2).

## unfortunately strucchange cannot use the nice formula above
## directly -> so extract the trend regressor and the matrix with
## the harmonic regressors
dtemp <- data.frame(temp = temp)
dtemp$trend <- model.matrix(dm0)[, 2]
dtemp$harmon <- model.matrix(dm0)[, 3:6]
colnames(dtemp$harmon) <- c("cos1", "cos2", "sin1", "sin2")
head(dtemp)
m0 <- lm(temp ~ trend + harmon, data = dtemp)
summary(m0) ## --> same as above

## supF test
library("strucchange")
fs <- Fstats(temp ~ trend + harmon, data = dtemp, from = 0.1)
plot(fs)
sctest(fs)
(bp <- breakpoints(temp ~ trend + harmon, data = dtemp, breaks=1, h=0.1))
confint(bp)
## inspect fitted coefficients in both segments
coef(bp)

## M-fluctuation tests
scus <- gefp(temp ~ trend + harmon, data = dtemp)
plot(scus, functional = supLM(0.1))
plot(scus, aggregate = FALSE, ylim = c(-2.2, 2.2))
coef(m0)

## visualization
plot(temp, col = "black", lwd = 2, ylab="monthly air temperature")
X<-time(temp)
fit<-fitted(m0)
lines(x=as.vector(X), y=as.vector(fit), col="red", type="l")
## trend
lines(ts(coef(m0)[1] + coef(m0)[2] * dtemp$trend, start = 1920, freq = 12),
      col = 4, lwd = 2, lty = 5)
summary(m0) # slope is 0.03 per what unit?
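# To express the trend in degrees C per year, take the fitted trend line at the first
# and last month, divide the difference by the number of months, and rescale to years:
#   slope per year = (fit_end - fit_start) / 240 * 12
# which is exactly what the manual calculation below does.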
coef(m0)[1] + coef(m0)[2] * dtemp$trend # how we get the fitted trend values
(9.156176-9.776694)           # change over 240 months
(9.156176-9.776694)*-1/240*12 # slope is 0.03/year
0.0310259*20                  # the total increase over the 20 years

## seasonality patterns
a<-coef(m0)[3:6]
harmoncos1<-coef(m0)[3]
harmoncos2<-coef(m0)[4]
harmonsin1<-coef(m0)[5]
harmonsin2<-coef(m0)[6]
(a<-c(harmoncos1,harmoncos2,harmonsin1,harmonsin2))
seas <- model.matrix(m0)[1:12,3:6] %*% a
# head(model.matrix(m0))[,1:6]
#   (Intercept)     trend harmoncos1 harmoncos2    harmonsin1    harmonsin2
# 1           1 0.1666667        1.0        1.0 -9.308876e-13 -1.861775e-12
# 2           1 0.3333333        0.5       -0.5  8.660254e-01  8.660254e-01
# 3           1 0.5000000       -0.5       -0.5  8.660254e-01 -8.660254e-01
# 4           1 0.6666667       -1.0        1.0  6.006076e-13 -1.201215e-12
# 5           1 0.8333333       -0.5       -0.5 -8.660254e-01  8.660254e-01
# 6           1 1.0000000        0.5       -0.5 -8.660254e-01 -8.660254e-01
# plot(head(model.matrix(m0))[,2], head(model.matrix(m0))[,3], type="l")
colnames(seas) <- c("1920-1939")
rownames(seas) <- month.abb[1:12]
barplot(t(seas), beside = TRUE, legend = TRUE)
seas

## annual amplitude and phase
sqrt(coef(m0)[3]^2 + coef(m0)[5]^2)
atan(-coef(m0)[5]/coef(m0)[3])
## bi-annual amplitude and phase
sqrt(coef(m0)[4]^2 + coef(m0)[6]^2)
atan(-coef(m0)[6]/coef(m0)[4])
## with respect to the long-term mean

## continuous seasonality patterns
tr <- 0:365/365 # tr goes from 0 to 1 in 365 steps
h1 <- cbind("cos" = cos(2 * pi * 1 * tr), "sin" = sin(2 * pi * 1 * tr))
h2 <- cbind("cos" = cos(2 * pi * 2 * tr), "sin" = sin(2 * pi * 2 * tr))
s1a <- h1 %*% coef(m0)[c(3,5)] # annual harmonic
# coef(m0)[c(3,5)]
# harmoncos1 harmonsin1
#  -9.471048  -2.706701
s2a <- h2 %*% coef(m0)[c(4,6)] # bi-annual harmonic

#### visualization
par(mfrow = c(1, 2))
plot(tr, s1a + s2a, type = "l", lty = 1, xlab = "Time (within year)",
     ylab = "Seasonality", main = "Combined")
legend("bottomleft", colnames(seas), lty = 1, bty = "n")
plot(tr, s1a, type = "l", lty = 1, col = "blue", xlab = "Time (within year)",
     ylab = "Seasonality", main = "Separate")
lines(tr, s2a, col = "red", lty = 2)
legend("bottomleft", c(colnames(seas), "annual", "bi-annual"),
       lty = c(1, 1, 2), col = c(1, 4, 2), bty = "n")

#### check residuals from the regression model for autocorrelation
par(mfrow = c(1,2))
acf(residuals(m0))
pacf(residuals(m0))
# Ljung-Box test for examining the null hypothesis of independence in a time series
Box.test(residuals(m0), type = "Ljung", lag = 12)
## normality seems to be roughly ok
hist(residuals(m0))
qqnorm(residuals(m0))
qqline(residuals(m0))

###### what we could do
library(nlme)
m0 <- gls(temp ~ trend + harmon, data = dtemp)
m1 <- gls(temp ~ trend + harmon, correlation=corAR1(form=~1), data = dtemp, method="REML")
summary(m1)
anova(m0,m1)
par(mfrow=c(1,1))
plot(temp, col = "black", lwd = 2, ylab="monthly air temperature")
X<-time(temp)
fit<-fitted(m1)
lines(x=as.vector(X), y=as.vector(fit), col="red", type="l")
acf(residuals(m1, type="normalized"))
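# A small follow-up sketch (standard nlme accessors, my addition): pull the estimated
# AR(1) coefficient out of the gls fit to see how strong the residual autocorrelation is.
coef(m1$modelStruct$corStruct, unconstrained = FALSE) # Phi, the residual lag-1 correlation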
## Flaim et al. 2016 Freshwater Biology: an example of strucchange and cosine trends

################################################################################
## set up a data frame with all auxiliary variables
plus<-c(rep(0,120), rep(10, 120)) # add a +10 step change to the second half
tempi<-temp+plus
plot(tempi)
library(trend)
(res <- smk.test(tempi))
## summary method
summary(res)
fit<-bfast(tempi,h=0.15,season="harmonic",max.iter=10)
plot(fit)
fit

########### for safety reasons let's do the analysis step by step
dtempi <- data.frame(tempi = tempi)
dtempi$time <- as.numeric(time(tempi)) + 1/12
dtempi$trend <- dtempi$time - 1975 ## -> trend uses 1975 as origin
dtempi$harmon1 <- cbind("cos"=cos(2*pi*1*dtempi$time), "sin"=sin(2*pi*1*dtempi$time))
dtempi$harmon2 <- cbind("cos"=cos(2*pi*2*dtempi$time), "sin"=sin(2*pi*2*dtempi$time))
## -> separate harmonic matrices for annual and bi-annual patterns
head(dtempi)

## regression model
m0 <- lm(tempi ~ trend + harmon1 + harmon2, data = dtempi)
summary(m0)

## supF test
library("strucchange")
fs <- Fstats(tempi ~ trend + harmon1 + harmon2, data = dtempi, from = 0.1)
plot(fs)
sctest(fs)
(bp <- breakpoints(tempi ~ trend + harmon1 + harmon2, data = dtempi, breaks=1, h=0.1))
confint(bp)

## M-fluctuation tests
dtempo <- data.frame(tempi = tempi)
dtempo$trend <- model.matrix(dm0)[, 2]
dtempo$harmon <- model.matrix(dm0)[, 3:6]
scus <- gefp(tempi ~ trend + harmon, data = dtempo)
plot(scus, functional = supLM(0.1))
plot(scus, aggregate = FALSE, ylim = c(-2.2, 2.2))

## inspect fitted coefficients in both segments
coef(bp)
seg <- breakfactor(bp)
# two different possibilities to get the results
seglm<-lm(tempi ~ 0+ seg/trend+ seg/harmon1 + seg/harmon2, data = dtempi) # intercepts as they are
seglm<-lm(tempi ~ seg/trend+ seg/harmon1 + seg/harmon2, data = dtempi)    # relative to a baseline
summary(seglm)

## visualization
par(mfrow=c(1,1))
plot(tempi, col = "black", lwd = 2, ylab="tempi")
lines(fitted(bp))
lines(confint(bp))
## trends
seg <- breakfactor(bp)
lines(ts(coef(bp)[seg, 1] + coef(bp)[seg, 2] * dtempi$trend, start = 1920, freq = 12),
      col = "blue", lwd = 2, lty = 5)
as.matrix(ts(coef(bp)[seg, 1] + coef(bp)[seg, 2] * dtempi$trend, start = 1920, freq = 12))

## seasonality patterns (at observed time points)
seas <- model.matrix(m0)[1:12, 3:6] %*% t(coef(bp)[, 3:6])
rownames(seas) <- apply(matrix(month.abb, 1, 12), 2, paste, collapse = "/")
barplot(t(seas), beside = TRUE, legend = TRUE, col=c("red","blue"))
## -> amplitude increases over time

## continuous seasonality patterns
tr <- 0:365/365 # tr goes from 0 to 1 in 365 steps
h1 <- cbind("cos" = cos(2 * pi * 1 * tr), "sin" = sin(2 * pi * 1 * tr))
h2 <- cbind("cos" = cos(2 * pi * 2 * tr), "sin" = sin(2 * pi * 2 * tr))
s1a <- h1 %*% coef(bp)[1, 3:4] # annual harmonic, segment 1
# > coef(bp)[1, 3:4]
# harmon1cos harmon1sin
#  -621.3082  -598.2405
s2a <- h2 %*% coef(bp)[1, 5:6] # bi-annual harmonic, segment 1
s1b <- h1 %*% coef(bp)[2, 3:4] # annual harmonic, segment 2
s2b <- h2 %*% coef(bp)[2, 5:6] # bi-annual harmonic, segment 2
par(mfrow = c(1, 2))
plot(tr, s1b + s2b, type = "l", lty = 1, xlab = "Time (within year)",
     ylab = "Seasonality", main = "Combined")
lines(tr, s1a + s2a, lty = 3)
legend("topleft", colnames(seas), lty = c(3,1), bty = "n")
plot(tr, s1b, type = "l", lty = 1, col = 2, xlab = "Time (within year)",
     ylab = "Seasonality", main = "Separate")
lines(tr, s1a, col = 2, lty = 2)
lines(tr, s2b, col = 4, lty = 1)
lines(tr, s2a, col = 4, lty = 2)
legend("topleft", c(colnames(seas), "annual", "bi-annual"),
       lty = c(2,1,1,2), col = c(1, 1, 2, 4), bty = "n")
cbind(c(s1a + s2a), c(s1b + s2b)) # extract the values
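# Background for the amplitude/phase lines below (a standard trigonometric identity):
#   a*cos(theta) + b*sin(theta) = A*cos(theta - phi),
# with amplitude A = sqrt(a^2 + b^2) and phase phi = atan2(b, a);
# the atan(-b/a) form used below is the same idea under a different sign convention.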
## annual amplitude and phase
sqrt(coef(bp)[,3]^2 + coef(bp)[,4]^2)
atan(-coef(bp)[,4]/coef(bp)[,3])
## bi-annual amplitude and phase
sqrt(coef(bp)[,5]^2 + coef(bp)[,6]^2)
atan(-coef(bp)[,6]/coef(bp)[,5])

## check residuals from the breakpoint model for autocorrelation
par(mfrow = c(1, 2))
acf(residuals(bp))
pacf(residuals(bp))
## --> virtually no autocorrelation left, Box-Ljung test non-significant
# Ljung-Box test for examining the null hypothesis of independence in a time series
Box.test(residuals(bp), type = "Ljung", lag = 6)
## normality seems to be roughly ok
hist(residuals(bp))
qqnorm(residuals(bp))
qqline(residuals(bp))

###### what we could do
library(nlme)
m0 <- gls(tempi ~ 0+ seg/trend+ seg/harmon1 + seg/harmon2, data = dtempi)
m1 <- gls(tempi ~ 0+ seg/trend+ seg/harmon1 + seg/harmon2, data = dtempi,
          correlation=corAR1(form=~1))
summary(m1)
anova(m0,m1)
par(mfrow=c(1,1))
plot(tempi, col = "black", lwd = 2, ylab="monthly air temperature")
X<-time(tempi)
fit<-fitted(m1)
lines(x=as.vector(X), y=as.vector(fit), col="red", type="l")
acf(residuals(m1, type="normalized"))

################################################################################
## new data: a linearly increasing trend added to the second half
plus<-c(rep(0,120), seq(1:120))
tempii<-temp+plus
plot(tempii)
library(trend)
(res <- smk.test(tempii))
## summary method
summary(res)
fit<-bfast(tempii,h=0.15,season="harmonic",max.iter=10)
plot(fit)
fit

dtempii <- data.frame(tempii = tempii)
dtempii$time <- as.numeric(time(tempii)) + 1/12
dtempii$trend <- dtempii$time - 1975 ## -> trend uses 1975 as origin
dtempii$harmon1 <- cbind("cos"=cos(2*pi*1*dtempii$time), "sin"=sin(2*pi*1*dtempii$time))
dtempii$harmon2 <- cbind("cos"=cos(2*pi*2*dtempii$time), "sin"=sin(2*pi*2*dtempii$time))
## -> separate harmonic matrices for annual and bi-annual patterns

## regression model
m0 <- lm(tempii ~ trend + harmon1 + harmon2, data = dtempii)
summary(m0)

## supF test
library("strucchange")
fs <- Fstats(tempii ~ trend + harmon1 + harmon2, data = dtempii, from = 0.1)
plot(fs)
sctest(fs)
(bp <- breakpoints(tempii ~ trend + harmon1 + harmon2, data = dtempii, breaks=1, h=0.1))
confint(bp) ## breakpoint

## inspect fitted coefficients in both segments
coef(bp)
seg <- breakfactor(bp)
seglm<-lm(tempii ~ 0+ seg/trend+ seg/harmon1 + seg/harmon2, data = dtempii)
summary(seglm)

## trends
seg <- breakfactor(bp)
plot(dtempii$tempii, col = "black", lwd = 2, ylab="tempii")
lines(ts(coef(bp)[seg, 1] + coef(bp)[seg, 2] * dtempii$trend, start = 1920, freq = 12),
      col = "blue", lwd = 2, lty = 5)
as.matrix(ts(coef(bp)[seg, 1] + coef(bp)[seg, 2] * dtempii$trend, start = 1920, freq = 12))
## --> linear trend (with breakpoint)

## seasonality patterns (at observed time points)
seas <- model.matrix(m0)[1:12, 3:6] %*% t(coef(bp)[, 3:6])
rownames(seas) <- apply(matrix(month.abb, 1, 12), 2, paste, collapse = "/")
barplot(t(seas), beside = TRUE, legend = TRUE, col=c("red","blue"))
barplot(t(seas), beside = TRUE, legend = TRUE)
## -> amplitude increases over time

## continuous seasonality patterns
tr <- 0:365/365 # tr goes from 0 to 1 in 365 steps
h1 <- cbind("cos" = cos(2 * pi * 1 * tr), "sin" = sin(2 * pi * 1 * tr))
h2 <- cbind("cos" = cos(2 * pi * 2 * tr), "sin" = sin(2 * pi * 2 * tr))
s1a <- h1 %*% coef(bp)[1, 3:4] # annual harmonic, segment 1
s2a <- h2 %*% coef(bp)[1, 5:6] # bi-annual harmonic, segment 1
s1b <- h1 %*% coef(bp)[2, 3:4] # annual harmonic, segment 2
s2b <- h2 %*% coef(bp)[2, 5:6] # bi-annual harmonic, segment 2
par(mfrow = c(1, 2))
plot(tr, s1b + s2b, type = "l", lty = 1, xlab = "Time (within year)",
     ylab = "Seasonality", main = "Combined")
lines(tr, s1a + s2a, lty = 3)
legend("topleft", colnames(seas), lty = c(3,1), bty = "n")
plot(tr, s1b, type = "l", lty = 1, col = 2, xlab = "Time (within year)",
     ylab = "Seasonality", main = "Separate")
lines(tr, s1a, col = 2, lty = 2)
lines(tr, s2b, col = 4, lty = 1)
lines(tr, s2a, col = 4, lty = 2)
legend("topleft", c(colnames(seas), "annual", "bi-annual"),
       lty = c(2,1,1,2), col = c(1, 1, 2, 4), bty = "n")
cbind(c(s1a + s2a), c(s1b + s2b)) # extract the values

## annual amplitude and phase
sqrt(coef(bp)[,3]^2 + coef(bp)[,4]^2)
atan(-coef(bp)[,4]/coef(bp)[,3])
## bi-annual amplitude and phase
sqrt(coef(bp)[,5]^2 + coef(bp)[,6]^2)
atan(-coef(bp)[,6]/coef(bp)[,5])
## --> annual amplitude increases over time

## check residuals from the breakpoint model for autocorrelation
par(mfrow = c(1, 2))
acf(residuals(bp))
pacf(residuals(bp))
## --> virtually no autocorrelation left, Box-Ljung test non-significant
# Ljung-Box test for examining the null hypothesis of independence in a time series
Box.test(residuals(bp), type = "Ljung", lag = 6)
## even normality seems to be roughly ok
hist(residuals(bp))
qqnorm(residuals(bp))
qqline(residuals(bp))

################ * generalised additive modelling (GAM) * #####################
# https://m-clark.github.io/generalized-additive-models/building_gam.html
# https://fromthebottomoftheheap.net/2014/05/09/modelling-seasonal-data-with-gam/#fn1
# Spline regression is a non-parametric regression technique.
# The dataset is divided into bins at intervals or points which we call knots,
# and each bin gets its own separate fit.
# https://medium.com/analytics-vidhya/spline-regression-in-r-960ca82aa62c#:~:text=Spline%20Regression%20is%20one%20of,bin%20has%20its%20separate%20fit.
library(mgcv)
library(ggplot2)
library(tidyverse)
# Behind the scenes, the model is trying to balance two competing goals.
# On the one hand we want to maximize the fit to the data. In linear regression,
# this goal amounts to minimizing the sum of squared errors. On the other hand,
# we want to minimize wiggliness (overfitting). In penalized smoothing splines,
# this is done by first specifying a penalty matrix that defines wiggliness for
# that spline basis.
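# In formula form (the standard penalized least-squares criterion for a cubic
# smoothing spline; lambda is the smoothing parameter, which mgcv picks by GCV or REML):
#   minimize  sum_i (y_i - f(x_i))^2 + lambda * integral of f''(x)^2 dx
# Small lambda -> wiggly fit close to the data; large lambda -> smooth, nearly linear fit.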
# pseudocode
# mod <- gam(y ~ s(x1) + s(x2), data = foo)

################################################################################
library(mgcv)
setwd("/Users/uliobertegger/Documents/Eigene Dateien/nach 2010/Kongresse-Tagungen/2021/SPF WP4_December 2021")
cet<-read.csv("cet.csv")
head(cet)
## get rid of the annual column too - store it for plotting
rn <- cet$X
Years <- rn[1]:rn[length(rn)]
annCET <- data.frame(Temperature = cet$Annual, Year = Years)
head(annCET)
ceti <- data.frame(year=Years, cet[, 2:13])
head(ceti)
dim(ceti)
(cet.new<-ceti %>%
    pivot_longer(!year, names_to="month", values_to="temp") %>%
    mutate(Month=rep(1:12,362)) %>%
    mutate(Date=as.Date(paste(year, Month, 15, sep="-"))) %>%
    mutate(Time= as.numeric(Date) / 1000)) # days since 1970-01-01, scaled; centred around 1970
head(cet.new)
ggplot(cet.new, aes(x=Date, y=temp)) + geom_line()
ggplot(annCET, aes(x=Year, y=Temperature)) + geom_line()

# bs = "cc": cyclic cubic spline
# k = 12, because of the 12 months
m <- gam(temp ~ s(Month, bs = "cc", k = 12) + s(Time), data = cet.new)
summary(m)
layout(matrix(1:2, ncol = 2))
plot(m, scale = 0)
layout(1)
layout(matrix(1:2, ncol = 2))
acf(resid(m), lag.max = 36, main = "ACF")
pacf(resid(m), lag.max = 36, main = "pACF")
layout(1)
# some low-order AR model is needed
ctrl <- list(niterEM = 0, msVerbose = TRUE, optimMethod="L-BFGS-B")

## null model
# gamm has "ML" as default
m <- gamm(temp ~ s(Month, bs = "cc", k = 12) + s(Time), data = cet.new, method="REML")
summary(m$gam)
acf(residuals(m$lme))
## AR(1)
# corARMA(form = ~ 1|year, p = 1) -> |year is a grouping factor: the correlation
# structure is assumed to apply only to observations within the same group
m1 <- gamm(temp ~ s(Month, bs = "cc", k = 12) + s(Time, k = 20), data = cet.new,
           correlation = corARMA(form = ~ 1|year, p = 1), control = ctrl, method="REML")
## AR(2)
m2 <- gamm(temp ~ s(Month, bs = "cc", k = 12) + s(Time, k = 20), data = cet.new,
           correlation = corARMA(form = ~ 1|year, p = 2), control = ctrl, method="REML")
## AR(3)
m3 <- gamm(temp ~ s(Month, bs = "cc", k = 12) + s(Time, k = 20), data = cet.new,
           correlation = corARMA(form = ~ 1|year, p = 3), control = ctrl, method="REML")
anova(m$lme, m1$lme, m2$lme, m3$lme)
layout(matrix(1:2, ncol = 2))
plot(m2$gam, scale = 0)
layout(1)
layout(matrix(1:2, ncol = 2))
res <- resid(m2$lme, type = "normalized")
acf(res, lag.max = 36, main = "ACF - AR(2) errors")
pacf(res, lag.max = 36, main = "pACF - AR(2) errors")
layout(1)

# best source for GAMs:
# https://fromthebottomoftheheap.net/2015/11/21/climate-change-and-spline-interactions/
# fit a model that allows the seasonal part of the model to change in time along with the trend:
# pseudo R code
# mod <- gam(y ~ te(x1, x2), data = foo)

############### hierarchical clustering of time series #########################
library(dtwclust)
setwd("/Users/uliobertegger/Documents/Eigene Dateien/nach 2010/Kongresse-Tagungen/2021/SPF WP4_December 2021")
emission<-read.csv("emission.csv") # emissions of greenhouse gases per capita, Eurostat (2018)
head(emission)
# dynamic time warping (DTW)
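# DTW aligns two series by locally stretching or compressing the time axis so that similar
# shapes match even when they are shifted in time. A tiny sketch (my example, using
# dtwclust::dtw_basic) comparing DTW with the plain Euclidean distance:
x <- sin(seq(0, 2*pi, length.out = 50))
y <- sin(seq(0, 2*pi, length.out = 50) - 0.5) # same shape, phase-shifted
dtw_basic(x, y)      # DTW distance: small, the warping absorbs the shift
sqrt(sum((x - y)^2)) # Euclidean distance: larger, point-by-point comparison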
# normalise the data and perform hierarchical clustering using the DTW distance for 2-6 clusters
library(vegan)
emissions.norm <- decostand(emission[,-1], method="standardize", MARGIN=1)
?tsclust
(clust.pam <- tsclust(emissions.norm, type="hierarchical", k=2L:6L, distance="dtw"))
# evaluation of clustering efficiency, because clustering is an unsupervised procedure
# cluster validity indices (CVIs)
out<-sapply(clust.pam, cvi, type = "internal")
colnames(out)<-c("2 clusters", "3 clusters", "4 clusters", "5 clusters", "6 clusters")
out
clust.pam <- tsclust(emissions.norm, type="hierarchical", k=4L, distance="dtw")
plot(clust.pam, type = "sc")
plot(clust.pam)
# The dashed line represents the representative time series.
t(cbind(emission[,1], cluster = clust.pam@cluster))
# see Obertegger et al. 2017 Water Resources Research
################################################################################
################################################################################