In this analysis, we take a look at the aggregated U.S. net electrical generation, and how that power is produced. The preliminary steps will show the breakdown of various fuel sources and then we will move into a quick forecast of coal consumption within the next few years.
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(RColorBrewer)))
suppressWarnings(suppressMessages(library(kableExtra)))
suppressWarnings(suppressMessages(library(gridExtra)))
suppressWarnings(suppressMessages(library(forecast)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(sparklyr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
First, we will load cleansed data from a previous ETL phase that was stored in hadoop. This information is provided from the bulk downloaded files available at the U.S. Energy Information Administration.
sc <- sparklyr::spark_connect(
master = 'local[2]',
spark_home = Sys.getenv("SPARK_HOME"))
total_energy_fact_df <- sparklyr::spark_read_parquet(
sc = sc,
name = 'total_energy_fact_df',
path = "hdfs://localhost:9000/Processed/TotalEnergyFactDF")
total_energy_dim_df <- sparklyr::spark_read_parquet(
sc = sc,
name = 'total_energy_dim_df',
path = "hdfs://localhost:9000/Processed/TotalEnergyDimDF") %>% select(
"name",
"units",
"series_id"
)
total_energy_df <- total_energy_dim_df %>%
filter(name %rlike% "(Electricity Net Generation From.*All Sectors, Monthly$)") %>%
sdf_broadcast() %>%
left_join(
total_energy_fact_df,
by = "series_id",
how = "left"
) %>%
mutate(
date = as.Date(date),
value = as.numeric(value)
) %>%
arrange(date)
We will cut off any measurements older than 1990, as many records are missing for certain categories, and the does not appear to be useful in its current form. After a quick check, we see that there are no null values in the fields we selected.
electricity_net_generation_df <- total_energy_df %>%
dplyr::filter(date >= as.Date("1990-01-01") & date < as.Date("2020-01-01"))
electricity_net_generation_df %>%
dplyr::filter(is.na(value) | value == '') %>%
dplyr::count(name = "null fields") %>%
kable('html') %>%
kable_styling(
bootstrap_options = 'striped',
full_width = FALSE,
position = "center",
font_size = 20)
date_limit_df <- electricity_net_generation_df %>%
group_by() %>%
dplyr::summarize(
min_date = min(date),
max_date = max(date)) %>%
sparklyr::collect()
## Warning: Missing values are always removed in SQL.
## Use `MIN(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## Warning: Missing values are always removed in SQL.
## Use `MAX(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
electricity_net_generation_pivot_df <- electricity_net_generation_df %>%
dplyr::mutate(
name = regexp_replace(name, '^Electricity Net Generation From (.*), All Sectors, Monthly$', 'eng_$1'),
name = tolower(name)
) %>%
mutate(
name = regexp_replace(name, ' ', '_')
) %>%
sdf_pivot(
formula = date ~ name,
fun.aggregate = list(value = "mean"))
electricity_net_generation_pivot_df <- sparklyr::sdf_register(
electricity_net_generation_pivot_df,
"electricity_net_generation_pivot_df")
sparklyr::tbl_cache(
sc,
"electricity_net_generation_pivot_df")
And now, we can take a first look at the net generation by fuel type. It’s immediately obvious that coal, natural gas, and nuclear power make up the major portion of consumed fuels, so we will group all others into a single category for simplicity.
color_values <- colorspace::diverge_hcl(n = 12, h = c(180, 260), c = c(0, 260), l = c(0,80), power = 2)
color_values_big <- colorspace::diverge_hcl(n = 20, h = c(180, 260), c = c(0, 260), l = c(0,100), power = 2)
electricity_net_generation_pivot_melted_df <- electricity_net_generation_pivot_df %>%
collect() %>%
gather(key = "id", value = 'val', -c(date))
plot_theme <- theme(plot.title = element_text(hjust = 0.5, size = 25),
plot.subtitle = element_text(hjust = 0.5, size = 14),
strip.text = element_text(size = 20),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=14, color = "grey1"),
axis.title = element_text(size=16, color = "grey1"))
ggplot(
data = electricity_net_generation_pivot_melted_df,
mapping = aes(x = date, y = val, color = id)) +
geom_line() +
scale_color_manual(values = color_values) +
labs(x = "Date", y = "Million kWh") +
ggtitle("Electricity Net Generation") +
plot_theme
min_cutoff_date <- '2009-01-01'
electricity_net_generation_pivot_reduced_df <- electricity_net_generation_df %>%
filter(date > as.Date(min_cutoff_date)) %>%
dplyr::mutate(
name = regexp_replace(name, '^Electricity Net Generation From (.*), All Sectors, Monthly$', 'eng_$1'),
name = tolower(name)) %>%
mutate(
name = regexp_replace(name, ' ', '_')
) %>%
mutate(
name = ifelse(
name %in% c(
"eng_coal",
"eng_natural_gas",
"eng_nuclear_electric_power"),
name,
"eng_other")
) %>%
sdf_pivot(
formula = date ~ name,
fun.aggregate = list(value = "mean"))
electricity_net_generation_pivot_reduced_df <- sparklyr::sdf_register(
electricity_net_generation_pivot_reduced_df,
"electricity_net_generation_pivot_reduced_df")
sparklyr::tbl_cache(
sc,
"electricity_net_generation_pivot_reduced_df")
date_limit_reduced_df <- electricity_net_generation_pivot_reduced_df %>%
group_by() %>%
dplyr::summarize(
min_date = min(date),
max_date = max(date)) %>%
sparklyr::collect()
Now we can see that in general the generation from coal has been on a slow decline with natural gas doing the opposite and nuclear electric power holding at a steady output.
electricity_net_generation_pivot_melted_df <- electricity_net_generation_pivot_reduced_df %>%
collect() %>%
gather(key = "id", value = 'val', -c(date))
ggplot(
data = electricity_net_generation_pivot_melted_df,
mapping = aes(x = date, y = val, color = id)) +
geom_line() +
scale_color_manual(values = color_values) +
labs(x = "Date", y = "Million kWh") +
ggtitle("Electricity Net Generation - Reduced Categories") +
plot_theme
So, let’s see if we can predict the future of coal consumption. Note, this is very much a simple analysis and there are many factors that contribute to the various fuels usages, inclding political and economical. For this analysis, we will try to fit the time series to an ARIMA model, given how flexible it can be in hyperparamter testing. Also, predictor variables can be passed to these models via the xreg
option, which we hope will help the model accuracy.
eng_coal_ts <- ts(
data = electricity_net_generation_pivot_reduced_df %>%
arrange(date) %>%
select("eng_coal") %>%
collect() %>%
pull("eng_coal"),
start = c(
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")),
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
end = c(
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y")),
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%m"))),
frequency = 12)
From the seasonal plot wee see a rather consistent pattern of peaks and valleys based on the month. The ACF plot is decaying, indicating that an arima differencing model may be appropriate, or could benefit from enabling drift. The PACF plot show spikes at periodic lags, suggesting that differencing and/or moving average models may be appropriate as well.
ggseasonplot(eng_coal_ts, polar=TRUE) +
scale_color_manual(values=color_values_big) +
ggtitle("Seasonal Coal Generation") +
plot_theme
autoplot(eng_coal_ts, series='original') +
labs(x = "Year", y = "Million kWh") +
ggtitle("Electricity Net Generation From Coal") +
scale_color_manual(values=color_values) +
plot_theme
acf_plot <- ggAcf(eng_coal_ts) +
plot_theme +
ggtitle("ACF")
pacf_plot <- ggPacf(eng_coal_ts) +
plot_theme +
ggtitle("PACF")
grid.arrange(acf_plot, pacf_plot, ncol=2)
In order to try and create the most useful model, we will try to use the natural gas power generation values as predictors for the ARIMA model as well. Some variations on this data were tested, namely the rolling mean value over the last 12 months and lagged generation values. Only one predictor was selected to avoid any multicollinearity issues. Also, due to the COVID pandemic, we will leave out the 2020 data from training and testing (that will likely be a different analysis in the future).
prediction_length <- 48
eng_coal_train_ts <- subset(eng_coal_ts, start = 1, end = length(eng_coal_ts) - prediction_length)
eng_coal_test_ts <- subset(eng_coal_ts, start = length(eng_coal_ts) - prediction_length)
# Previous attempts used the raw net generation values from other sources.
# Howver, using the cumulative means provides a simpler comparison and smoother
#predictor for fitting
#p_xreg <- matrix(
# c(
# electricity_net_generation_pivot_reduced_df %>% select('eng_natural_gas') %>% pull(),
# electricity_net_generation_pivot_reduced_df %>% select('eng_nuclear_electric_power') %>% pull()),
# ncol=2,
# byrow = FALSE
#)
#using rolling median
p_rollmean_df <- electricity_net_generation_pivot_df %>%
arrange(date) %>%
select(
date,
eng_natural_gas) %>%
# eng_nuclear_electric_power) %>%
collect() %>%
mutate(
#eng_natural_gas_rm = zoo::rollmeanr(lag(eng_natural_gas), k = 13, fill = NA),
eng_natural_gas = lag(eng_natural_gas)
# eng_nuclear_electric_power = zoo::rollmeanr(lag(eng_nuclear_electric_power), k = 13, fill = NA)
) %>%
filter(date > as.Date(min_cutoff_date)) %>%
select(-date)
#only pick one predictor to avoid collinearity issues
p_xreg <- matrix(
c(
#p_rollmean_df %>% select('eng_natural_gas_rm') %>% pull()),
p_rollmean_df %>% select('eng_natural_gas') %>% pull()),
#p_rollmean_df %>% select('eng_nuclear_electric_power') %>% pull()),
ncol=1,
byrow = FALSE
)
p_xreg_train <- matrix(p_xreg[1:length(eng_coal_train_ts),], ncol = dim(p_rollmean_df)[2])
p_xreg_test <- matrix(p_xreg[length(eng_coal_train_ts):length(eng_coal_ts),], ncol = dim(p_rollmean_df)[2])
#export for hyperparameter testing
save(eng_coal_train_ts, p_xreg_train, file = paste("../RFiles/Data/total_energy_coal_data.RData", sep="/"))
As a first step, we will evaluate what happens when we let auto.arima
pick the model for us. We will hold out the last 48 predictions for later testing. Non-seasonal differencing was not selected to be used; however, it appears that drift was enabled with a seasonal moving average. The regressors may have something to do with affecting which model was selected; however the standard error associated with each predictor does not indicate a strong influence. Given all that, do see that the box test has given us promising results that the residuals are not correlated.
auto_fit <- auto.arima(eng_coal_train_ts, xreg = p_xreg_train)
checkresiduals(auto_fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,0,0)(2,1,0)[12] errors
## Q* = 7.65, df = 12, p-value = 0.8119
##
## Model df: 5. Total lags used: 17
summary(auto_fit)
## Series: eng_coal_train_ts
## Regression with ARIMA(1,0,0)(2,1,0)[12] errors
##
## Coefficients:
## ar1 sar1 sar2 drift xreg
## 0.8196 -0.4175 -0.3479 -584.3776 0.0931
## s.e. 0.0891 0.1285 0.1404 265.0405 0.1962
##
## sigma^2 estimated as 58382832: log likelihood=-735.65
## AIC=1483.3 AICc=1484.61 BIC=1496.88
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 96.55095 6813.582 5052.253 -0.2310099 3.950352 0.4077027 0.0405893
Now, let’s give our own guess a shot where we used the time series cross validation function (tsCV) from the forecast package. This training focused on selecting the best model at the furthest prediction value (48). Note, I had to make a series of edits to make this function operate when provided with the xreg
option. Notes on these edits can be found here.
The results of the hyperparameter training also included a drift term, but this time added a non-seasonal moving average term.. However, during testing, it appeared that the inclusion of the drift parameter was affecting the significance of the predictor. By allowing the model the slowly decrease via the drift parameter, the inverse correlation between the natural gas consumption seemed to go unnoticed. Therefore, the selected model, and a ‘no drift’ model were kept for analysis.
hyper_fit <- Arima(
eng_coal_train_ts,
order=c(2,0,1),
seasonal=c(1,0,2),
lambda = NULL,
include.drift = TRUE,
xreg=p_xreg_train)
checkresiduals(hyper_fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,0,1)(1,0,2)[12] errors
## Q* = 11.013, df = 8, p-value = 0.201
##
## Model df: 9. Total lags used: 17
summary(hyper_fit)
## Series: eng_coal_train_ts
## Regression with ARIMA(2,0,1)(1,0,2)[12] errors
##
## Coefficients:
## ar1 ar2 ma1 sar1 sma1 sma2 intercept drift
## 0.6511 0.0683 0.2965 0.9572 -0.4218 -0.1182 148956.56 -541.8345
## s.e. 0.5019 0.4341 0.4796 0.0432 0.1689 0.1784 22353.54 216.1200
## xreg
## 0.0538
## s.e. 0.1907
##
## sigma^2 estimated as 60866206: log likelihood=-865.43
## AIC=1750.86 AICc=1753.91 BIC=1775.05
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 261.442 7366.563 5961.411 -0.2125715 4.54334 0.4810692 0.02496489
hyper_fit_no_drift <- Arima(
eng_coal_train_ts,
order=c(2,0,1),
seasonal=c(2,1,1),
lambda = NULL,
include.drift = FALSE,
xreg=p_xreg_train)
checkresiduals(hyper_fit_no_drift)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,0,1)(2,1,1)[12] errors
## Q* = 8.2108, df = 10, p-value = 0.6083
##
## Model df: 7. Total lags used: 17
summary(hyper_fit_no_drift)
## Series: eng_coal_train_ts
## Regression with ARIMA(2,0,1)(2,1,1)[12] errors
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2 sma1 xreg
## 1.6372 -0.6474 -0.7861 -0.1056 -0.2605 -0.3382 0.0875
## s.e. 0.4856 0.4625 0.4302 0.3086 0.1781 0.3223 0.2051
##
## sigma^2 estimated as 61480004: log likelihood=-736.72
## AIC=1489.44 AICc=1491.76 BIC=1507.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1015.213 6885.221 5172.932 -1.038683 4.081683 0.4174412
## ACF1
## Training set 0.02459151
Now that the two complex models have been trained, we can test them against each other as well as a few other simpler prediction methods for comparison. Specifically, we will add a seasonal naive predictor and an auto.arima
model using no predictors
auto_simple_fit <- auto.arima(eng_coal_train_ts)
checkresiduals(auto_simple_fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,1,0)[12] with drift
## Q* = 7.633, df = 13, p-value = 0.8667
##
## Model df: 4. Total lags used: 17
summary(auto_simple_fit)
## Series: eng_coal_train_ts
## ARIMA(1,0,0)(2,1,0)[12] with drift
##
## Coefficients:
## ar1 sar1 sar2 drift
## 0.8019 -0.4252 -0.3605 -534.1904
## s.e. 0.0847 0.1278 0.1381 219.8821
##
## sigma^2 estimated as 57528998: log likelihood=-735.76
## AIC=1481.52 AICc=1482.45 BIC=1492.84
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 114.9374 6814.622 4991.29 -0.2176097 3.901121 0.4027831 0.05003209
Our selected hyperparameter model performs the best; however, not by much. Also, as showin in the summary plots above, the predictor variables appear to have a very weak influence on the model and barely help the hyperparameter trained model fit outperform the simple ARIMA model.
hyper_fit_df <- hyper_fit %>%
forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>%
accuracy(eng_coal_test_ts) %>%
data.frame() %>%
tibble::rownames_to_column(var = "model") %>%
mutate(model = paste("Hyper",model))
hyper_fit_no_drift_df <- hyper_fit_no_drift %>%
forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>%
accuracy(eng_coal_test_ts) %>%
data.frame() %>%
tibble::rownames_to_column(var = "model") %>%
mutate(model = paste("Hyper No Drift",model))
auto_fit_df <- auto_fit %>%
forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>%
accuracy(eng_coal_test_ts) %>%
data.frame() %>%
tibble::rownames_to_column(var = "model") %>%
mutate(model = paste("Auto",model))
auto_simple_fit_df <- auto_simple_fit %>%
forecast(h = length(eng_coal_test_ts)) %>%
accuracy(eng_coal_test_ts) %>%
data.frame() %>%
tibble::rownames_to_column(var = "model") %>%
mutate(model = paste("Auto Simple",model))
snaive_fit_df <- snaive(y = eng_coal_train_ts, h = length(eng_coal_test_ts)) %>%
accuracy(eng_coal_test_ts) %>%
data.frame() %>%
tibble::rownames_to_column(var = "model") %>%
mutate(model = paste("Seasonal Naive",model))
hyper_fit_df %>% rbind(
hyper_fit_no_drift_df,
auto_fit_df,
auto_simple_fit_df,
snaive_fit_df
) %>%
filter(grepl('test', model, ignore.case = TRUE)) %>%
arrange(RMSE) %>%
kable('html') %>%
kable_styling(bootstrap_options = "striped", position = 'center', full_width = TRUE)
model | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 | Theil.s.U |
---|---|---|---|---|---|---|---|---|
Hyper Test set | -346.8382 | 8999.774 | 7183.254 | -1.7219416 | 7.869915 | 0.5796685 | 0.3569343 | 0.5987488 |
Auto Test set | 1575.6913 | 9179.720 | 7271.547 | 0.7468322 | 7.741919 | 0.5867935 | 0.3008190 | 0.5908042 |
Auto Simple Test set | 154.4648 | 9225.960 | 7324.011 | -0.7076261 | 7.886640 | 0.5910273 | 0.3467761 | 0.6026227 |
Hyper No Drift Test set | 3858.1697 | 11169.105 | 9283.151 | 2.8316702 | 9.676188 | 0.7491244 | 0.5301015 | 0.7188086 |
Seasonal Naive Test set | -17686.9412 | 24155.824 | 20598.446 | -20.7869917 | 23.484812 | 1.6622370 | 0.5553310 | 1.6586534 |
Now, we can plot the predictions of the four models we created in this analysis.
#Adjust future predictors
auto_simple_forecast <- forecast(
auto_simple_fit,
h = prediction_length)
auto_forecast <- forecast(
auto_fit,
h = prediction_length,
xreg=p_xreg_test)
hyper_forecast <- forecast(
hyper_fit,
h = prediction_length,
xreg=p_xreg_test
)
hyper_no_drift_forecast <- forecast(
hyper_fit_no_drift,
h = prediction_length,
xreg=p_xreg_test
)
base_plot <- autoplot(
ts(
c(p_xreg[,1]),
start = c(
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")),
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
frequency = 12),
series = 'mean natural gas') +
autolayer(
ts(
c(p_xreg),
start = c(
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")),
as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
frequency = 12),
series = 'actual natural gas') +
autolayer(
snaive(y = eng_coal_train_ts, length(eng_coal_test_ts)),
PI = FALSE,
series = "naive") +
labs(x="Date", y="Million kWh") +
scale_x_continuous(
labels=function(x){as.integer(x)},
limits=c(
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))-5,
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))+1),
breaks = seq(
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))-5,
as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))+1,
1)) +
autolayer(
eng_coal_ts,
series = "actual coal"
) +
plot_theme +
scale_color_viridis_d()
base_plot +
autolayer(auto_simple_forecast, alpha = 0.3) +
autolayer(
auto_simple_forecast,
series = "Prediction",
PI = FALSE) +
ggtitle(
'Auto Arima Model - No Predictors',
subtitle = as.character(auto_simple_fit))
base_plot +
autolayer(auto_forecast, alpha = 0.3) +
autolayer(
auto_forecast,
series = "Prediction",
PI = FALSE) +
ggtitle(
'Auto Arima Model',
subtitle = as.character(auto_fit)) +
plot_theme
base_plot +
autolayer(hyper_forecast, alpha = 0.3) +
autolayer(
hyper_forecast,
series = "Prediction",
PI = FALSE) +
ggtitle(
'Hyperparameter Model',
subtitle = as.character(hyper_fit)) +
plot_theme
base_plot +
autolayer(hyper_no_drift_forecast, alpha = 0.3) +
autolayer(
hyper_no_drift_forecast,
series = "Prediction",
PI = FALSE) +
ggtitle(
'Hyperparameter Model w/o Drift',
subtitle = as.character(hyper_fit_no_drift)) +
plot_theme
As we can see, the differences between models is minimal, which means we should lean towards selecting the simple arima model with no predictors until we can find some significant attributes that would reduce the error. Given that the simple model has less than a 9% mean absolute percentage error, it seems as though the decrease in consumption of coal is on a steady decline without any correlated fluctuations to the natural gas consumption. Therefore, it is unclear if using natural gas consumption as a predictor is useful, but it does not appear to be in this simple model. If additional feature engineering is performed or other significant predictors are included, then that may remove some of the noise. In general, there are multiple options to choose from this analysis that will give relatively good predictions, and the selection of one over the other may depend on the purposes for it’s future use. Obviously, other model types could be chosen; however, it seems as though more meaningful predictors would be the key to acheiving higher levels of accuracy.