Home Page

In this analysis, we take a look at the aggregated U.S. net electrical generation, and how that power is produced. The preliminary steps will show the breakdown of various fuel sources and then we will move into a quick forecast of coal consumption within the next few years.

suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(RColorBrewer)))
suppressWarnings(suppressMessages(library(kableExtra)))
suppressWarnings(suppressMessages(library(gridExtra)))
suppressWarnings(suppressMessages(library(forecast)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(sparklyr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))

First, we will load cleansed data from a previous ETL phase that was stored in hadoop. This information is provided from the bulk downloaded files available at the U.S. Energy Information Administration.

sc <- sparklyr::spark_connect(
  master = 'local[2]', 
  spark_home = Sys.getenv("SPARK_HOME"))
total_energy_fact_df <- sparklyr::spark_read_parquet(
  sc = sc, 
  name = 'total_energy_fact_df', 
  path = "hdfs://localhost:9000/Processed/TotalEnergyFactDF")

total_energy_dim_df <- sparklyr::spark_read_parquet(
  sc = sc, 
  name = 'total_energy_dim_df', 
  path = "hdfs://localhost:9000/Processed/TotalEnergyDimDF") %>% select(
    "name",
    "units",
    "series_id"
  )

total_energy_df <- total_energy_dim_df %>% 
  filter(name %rlike% "(Electricity Net Generation From.*All Sectors, Monthly$)") %>% 
  sdf_broadcast() %>% 
  left_join(
    total_energy_fact_df,
    by = "series_id",
    how = "left"
  ) %>% 
  mutate(
    date = as.Date(date),
    value = as.numeric(value)
  ) %>% 
  arrange(date)

We will cut off any measurements older than 1990, as many records are missing for certain categories, and the does not appear to be useful in its current form. After a quick check, we see that there are no null values in the fields we selected.

electricity_net_generation_df <- total_energy_df %>% 
  dplyr::filter(date >= as.Date("1990-01-01") & date < as.Date("2020-01-01"))
electricity_net_generation_df %>% 
  dplyr::filter(is.na(value) | value == '') %>% 
  dplyr::count(name = "null fields") %>% 
  kable('html') %>% 
  kable_styling(
    bootstrap_options = 'striped',
    full_width = FALSE,
    position = "center",
    font_size = 20)
date_limit_df <- electricity_net_generation_df %>% 
  group_by() %>% 
  dplyr::summarize(
    min_date = min(date), 
    max_date = max(date)) %>% 
  sparklyr::collect()
## Warning: Missing values are always removed in SQL.
## Use `MIN(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## Warning: Missing values are always removed in SQL.
## Use `MAX(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
electricity_net_generation_pivot_df <- electricity_net_generation_df %>% 
  dplyr::mutate(
    name = regexp_replace(name, '^Electricity Net Generation From (.*), All Sectors, Monthly$', 'eng_$1'),
    name = tolower(name)
  ) %>% 
  mutate(
    name = regexp_replace(name, ' ', '_')
  ) %>% 
  sdf_pivot(
    formula = date ~ name, 
    fun.aggregate = list(value = "mean"))

electricity_net_generation_pivot_df <- sparklyr::sdf_register(
    electricity_net_generation_pivot_df, 
    "electricity_net_generation_pivot_df")
sparklyr::tbl_cache(
  sc, 
  "electricity_net_generation_pivot_df")

And now, we can take a first look at the net generation by fuel type. It’s immediately obvious that coal, natural gas, and nuclear power make up the major portion of consumed fuels, so we will group all others into a single category for simplicity.

color_values <- colorspace::diverge_hcl(n = 12, h = c(180, 260), c = c(0, 260), l = c(0,80), power = 2)
color_values_big <- colorspace::diverge_hcl(n = 20, h = c(180, 260), c = c(0, 260), l = c(0,100), power = 2)

electricity_net_generation_pivot_melted_df <- electricity_net_generation_pivot_df %>% 
    collect() %>% 
    gather(key = "id", value = 'val', -c(date))

plot_theme <- theme(plot.title = element_text(hjust = 0.5, size = 25),
        plot.subtitle = element_text(hjust = 0.5, size = 14),
        strip.text = element_text(size = 20),
        plot.background = element_rect(fill = "lightgrey"), 
        panel.background = element_rect(fill = "white"), 
        panel.grid.major.x = element_line(color = "lightgrey"), 
        panel.grid.major.y = element_line(color = "lightgrey"), 
        axis.text = element_text(size=14, color = "grey1"), 
        axis.title = element_text(size=16, color = "grey1"))

ggplot(
  data = electricity_net_generation_pivot_melted_df, 
  mapping = aes(x = date, y = val, color = id)) +
  geom_line() +
  scale_color_manual(values = color_values) +
  labs(x = "Date", y = "Million kWh") +
  ggtitle("Electricity Net Generation") +
  plot_theme

min_cutoff_date <- '2009-01-01'
electricity_net_generation_pivot_reduced_df <- electricity_net_generation_df %>% 
  filter(date > as.Date(min_cutoff_date)) %>% 
  dplyr::mutate(
    name = regexp_replace(name, '^Electricity Net Generation From (.*), All Sectors, Monthly$', 'eng_$1'),
    name = tolower(name)) %>% 
  mutate(
    name = regexp_replace(name, ' ', '_')
  ) %>% 
  mutate(
    name = ifelse(
      name %in% c(
        "eng_coal", 
        "eng_natural_gas", 
        "eng_nuclear_electric_power"), 
      name, 
      "eng_other")
  ) %>% 
  sdf_pivot(
    formula = date ~ name, 
    fun.aggregate = list(value = "mean"))

electricity_net_generation_pivot_reduced_df <- sparklyr::sdf_register(
    electricity_net_generation_pivot_reduced_df, 
    "electricity_net_generation_pivot_reduced_df")
sparklyr::tbl_cache(
  sc, 
  "electricity_net_generation_pivot_reduced_df")

date_limit_reduced_df <- electricity_net_generation_pivot_reduced_df %>% 
  group_by() %>% 
  dplyr::summarize(
    min_date = min(date), 
    max_date = max(date)) %>% 
  sparklyr::collect()

Now we can see that in general the generation from coal has been on a slow decline with natural gas doing the opposite and nuclear electric power holding at a steady output.

electricity_net_generation_pivot_melted_df <- electricity_net_generation_pivot_reduced_df %>% 
    collect() %>% 
    gather(key = "id", value = 'val', -c(date))
ggplot(
  data = electricity_net_generation_pivot_melted_df, 
  mapping = aes(x = date, y = val, color = id)) +
  geom_line() +
  scale_color_manual(values = color_values) +
  labs(x = "Date", y = "Million kWh") +
  ggtitle("Electricity Net Generation - Reduced Categories") +
  plot_theme

So, let’s see if we can predict the future of coal consumption. Note, this is very much a simple analysis and there are many factors that contribute to the various fuels usages, inclding political and economical. For this analysis, we will try to fit the time series to an ARIMA model, given how flexible it can be in hyperparamter testing. Also, predictor variables can be passed to these models via the xreg option, which we hope will help the model accuracy.

eng_coal_ts <- ts(
  data = electricity_net_generation_pivot_reduced_df %>%  
    arrange(date) %>% 
    select("eng_coal") %>% 
    collect() %>% 
    pull("eng_coal"), 
  start = c(
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")), 
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
  end = c(
    as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y")), 
    as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%m"))),
  frequency = 12)

From the seasonal plot wee see a rather consistent pattern of peaks and valleys based on the month. The ACF plot is decaying, indicating that an arima differencing model may be appropriate, or could benefit from enabling drift. The PACF plot show spikes at periodic lags, suggesting that differencing and/or moving average models may be appropriate as well.

ggseasonplot(eng_coal_ts, polar=TRUE) +
  scale_color_manual(values=color_values_big) +
  ggtitle("Seasonal Coal Generation") +
  plot_theme

autoplot(eng_coal_ts, series='original') + 
  labs(x = "Year", y = "Million kWh") +
  ggtitle("Electricity Net Generation From Coal") +
  scale_color_manual(values=color_values) +
  plot_theme

acf_plot <- ggAcf(eng_coal_ts) + 
  plot_theme +
  ggtitle("ACF")
pacf_plot <- ggPacf(eng_coal_ts) +
  plot_theme +
  ggtitle("PACF")
grid.arrange(acf_plot, pacf_plot, ncol=2)

In order to try and create the most useful model, we will try to use the natural gas power generation values as predictors for the ARIMA model as well. Some variations on this data were tested, namely the rolling mean value over the last 12 months and lagged generation values. Only one predictor was selected to avoid any multicollinearity issues. Also, due to the COVID pandemic, we will leave out the 2020 data from training and testing (that will likely be a different analysis in the future).

prediction_length <- 48
eng_coal_train_ts <- subset(eng_coal_ts, start = 1, end = length(eng_coal_ts) - prediction_length)
eng_coal_test_ts <- subset(eng_coal_ts, start = length(eng_coal_ts) - prediction_length)

# Previous attempts used the raw net generation values from other sources.
# Howver, using the cumulative means provides a simpler comparison and smoother
#predictor for fitting
#p_xreg <- matrix(
#  c(
#    electricity_net_generation_pivot_reduced_df %>% select('eng_natural_gas') %>% pull(),
#    electricity_net_generation_pivot_reduced_df %>% select('eng_nuclear_electric_power') %>% pull()),
#  ncol=2, 
#  byrow = FALSE
#)

#using rolling median
p_rollmean_df <- electricity_net_generation_pivot_df %>% 
  arrange(date) %>% 
  select(
    date, 
    eng_natural_gas) %>%  
#    eng_nuclear_electric_power) %>% 
  collect() %>% 
  mutate(
    #eng_natural_gas_rm = zoo::rollmeanr(lag(eng_natural_gas), k = 13, fill = NA),
    eng_natural_gas = lag(eng_natural_gas)
 #   eng_nuclear_electric_power = zoo::rollmeanr(lag(eng_nuclear_electric_power), k = 13, fill = NA)
  ) %>% 
  filter(date > as.Date(min_cutoff_date)) %>% 
  select(-date) 
  
  
#only pick one predictor to avoid collinearity issues
p_xreg <- matrix(
  c(
    #p_rollmean_df %>% select('eng_natural_gas_rm') %>% pull()),
    p_rollmean_df %>% select('eng_natural_gas') %>% pull()),
    #p_rollmean_df %>% select('eng_nuclear_electric_power') %>% pull()),
  ncol=1, 
  byrow = FALSE
)

p_xreg_train <- matrix(p_xreg[1:length(eng_coal_train_ts),], ncol = dim(p_rollmean_df)[2])
p_xreg_test <- matrix(p_xreg[length(eng_coal_train_ts):length(eng_coal_ts),], ncol = dim(p_rollmean_df)[2])

#export for hyperparameter testing
save(eng_coal_train_ts, p_xreg_train, file = paste("../RFiles/Data/total_energy_coal_data.RData", sep="/"))

As a first step, we will evaluate what happens when we let auto.arima pick the model for us. We will hold out the last 48 predictions for later testing. Non-seasonal differencing was not selected to be used; however, it appears that drift was enabled with a seasonal moving average. The regressors may have something to do with affecting which model was selected; however the standard error associated with each predictor does not indicate a strong influence. Given all that, do see that the box test has given us promising results that the residuals are not correlated.

auto_fit <- auto.arima(eng_coal_train_ts, xreg = p_xreg_train)
checkresiduals(auto_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0)(2,1,0)[12] errors
## Q* = 7.65, df = 12, p-value = 0.8119
## 
## Model df: 5.   Total lags used: 17
summary(auto_fit)
## Series: eng_coal_train_ts 
## Regression with ARIMA(1,0,0)(2,1,0)[12] errors 
## 
## Coefficients:
##          ar1     sar1     sar2      drift    xreg
##       0.8196  -0.4175  -0.3479  -584.3776  0.0931
## s.e.  0.0891   0.1285   0.1404   265.0405  0.1962
## 
## sigma^2 estimated as 58382832:  log likelihood=-735.65
## AIC=1483.3   AICc=1484.61   BIC=1496.88
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE      ACF1
## Training set 96.55095 6813.582 5052.253 -0.2310099 3.950352 0.4077027 0.0405893

Now, let’s give our own guess a shot where we used the time series cross validation function (tsCV) from the forecast package. This training focused on selecting the best model at the furthest prediction value (48). Note, I had to make a series of edits to make this function operate when provided with the xreg option. Notes on these edits can be found here.

The results of the hyperparameter training also included a drift term, but this time added a non-seasonal moving average term.. However, during testing, it appeared that the inclusion of the drift parameter was affecting the significance of the predictor. By allowing the model the slowly decrease via the drift parameter, the inverse correlation between the natural gas consumption seemed to go unnoticed. Therefore, the selected model, and a ‘no drift’ model were kept for analysis.

hyper_fit <- Arima(
  eng_coal_train_ts, 
  order=c(2,0,1),
  seasonal=c(1,0,2),
  lambda = NULL,
  include.drift = TRUE,
  xreg=p_xreg_train)

checkresiduals(hyper_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,1)(1,0,2)[12] errors
## Q* = 11.013, df = 8, p-value = 0.201
## 
## Model df: 9.   Total lags used: 17
summary(hyper_fit)
## Series: eng_coal_train_ts 
## Regression with ARIMA(2,0,1)(1,0,2)[12] errors 
## 
## Coefficients:
##          ar1     ar2     ma1    sar1     sma1     sma2  intercept      drift
##       0.6511  0.0683  0.2965  0.9572  -0.4218  -0.1182  148956.56  -541.8345
## s.e.  0.5019  0.4341  0.4796  0.0432   0.1689   0.1784   22353.54   216.1200
##         xreg
##       0.0538
## s.e.  0.1907
## 
## sigma^2 estimated as 60866206:  log likelihood=-865.43
## AIC=1750.86   AICc=1753.91   BIC=1775.05
## 
## Training set error measures:
##                   ME     RMSE      MAE        MPE    MAPE      MASE       ACF1
## Training set 261.442 7366.563 5961.411 -0.2125715 4.54334 0.4810692 0.02496489
hyper_fit_no_drift <- Arima(
  eng_coal_train_ts, 
  order=c(2,0,1),
  seasonal=c(2,1,1),
  lambda = NULL,
  include.drift = FALSE,
  xreg=p_xreg_train)

checkresiduals(hyper_fit_no_drift)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,1)(2,1,1)[12] errors
## Q* = 8.2108, df = 10, p-value = 0.6083
## 
## Model df: 7.   Total lags used: 17
summary(hyper_fit_no_drift)
## Series: eng_coal_train_ts 
## Regression with ARIMA(2,0,1)(2,1,1)[12] errors 
## 
## Coefficients:
##          ar1      ar2      ma1     sar1     sar2     sma1    xreg
##       1.6372  -0.6474  -0.7861  -0.1056  -0.2605  -0.3382  0.0875
## s.e.  0.4856   0.4625   0.4302   0.3086   0.1781   0.3223  0.2051
## 
## sigma^2 estimated as 61480004:  log likelihood=-736.72
## AIC=1489.44   AICc=1491.76   BIC=1507.54
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -1015.213 6885.221 5172.932 -1.038683 4.081683 0.4174412
##                    ACF1
## Training set 0.02459151

Now that the two complex models have been trained, we can test them against each other as well as a few other simpler prediction methods for comparison. Specifically, we will add a seasonal naive predictor and an auto.arima model using no predictors

auto_simple_fit <- auto.arima(eng_coal_train_ts)
checkresiduals(auto_simple_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(2,1,0)[12] with drift
## Q* = 7.633, df = 13, p-value = 0.8667
## 
## Model df: 4.   Total lags used: 17
summary(auto_simple_fit)
## Series: eng_coal_train_ts 
## ARIMA(1,0,0)(2,1,0)[12] with drift 
## 
## Coefficients:
##          ar1     sar1     sar2      drift
##       0.8019  -0.4252  -0.3605  -534.1904
## s.e.  0.0847   0.1278   0.1381   219.8821
## 
## sigma^2 estimated as 57528998:  log likelihood=-735.76
## AIC=1481.52   AICc=1482.45   BIC=1492.84
## 
## Training set error measures:
##                    ME     RMSE     MAE        MPE     MAPE      MASE       ACF1
## Training set 114.9374 6814.622 4991.29 -0.2176097 3.901121 0.4027831 0.05003209

Our selected hyperparameter model performs the best; however, not by much. Also, as showin in the summary plots above, the predictor variables appear to have a very weak influence on the model and barely help the hyperparameter trained model fit outperform the simple ARIMA model.

hyper_fit_df <- hyper_fit %>% 
  forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>% 
  accuracy(eng_coal_test_ts) %>% 
  data.frame() %>%  
  tibble::rownames_to_column(var = "model") %>% 
  mutate(model = paste("Hyper",model))
hyper_fit_no_drift_df <- hyper_fit_no_drift %>% 
  forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>% 
  accuracy(eng_coal_test_ts) %>% 
  data.frame() %>%  
  tibble::rownames_to_column(var = "model") %>% 
  mutate(model = paste("Hyper No Drift",model))
auto_fit_df <- auto_fit %>% 
  forecast(h = length(eng_coal_test_ts), xreg=p_xreg_test) %>% 
  accuracy(eng_coal_test_ts) %>% 
  data.frame() %>%  
  tibble::rownames_to_column(var = "model") %>% 
  mutate(model = paste("Auto",model))
auto_simple_fit_df <- auto_simple_fit %>% 
  forecast(h = length(eng_coal_test_ts)) %>% 
  accuracy(eng_coal_test_ts) %>% 
  data.frame() %>%  
  tibble::rownames_to_column(var = "model") %>% 
  mutate(model = paste("Auto Simple",model))
snaive_fit_df <- snaive(y = eng_coal_train_ts, h = length(eng_coal_test_ts)) %>% 
  accuracy(eng_coal_test_ts)  %>% 
  data.frame() %>%  
  tibble::rownames_to_column(var = "model") %>% 
  mutate(model = paste("Seasonal Naive",model))

hyper_fit_df %>% rbind(
  hyper_fit_no_drift_df,
  auto_fit_df,
  auto_simple_fit_df,
  snaive_fit_df
) %>% 
  filter(grepl('test', model, ignore.case = TRUE)) %>% 
  arrange(RMSE) %>% 
  kable('html') %>% 
  kable_styling(bootstrap_options = "striped", position = 'center', full_width = TRUE)
model ME RMSE MAE MPE MAPE MASE ACF1 Theil.s.U
Hyper Test set -346.8382 8999.774 7183.254 -1.7219416 7.869915 0.5796685 0.3569343 0.5987488
Auto Test set 1575.6913 9179.720 7271.547 0.7468322 7.741919 0.5867935 0.3008190 0.5908042
Auto Simple Test set 154.4648 9225.960 7324.011 -0.7076261 7.886640 0.5910273 0.3467761 0.6026227
Hyper No Drift Test set 3858.1697 11169.105 9283.151 2.8316702 9.676188 0.7491244 0.5301015 0.7188086
Seasonal Naive Test set -17686.9412 24155.824 20598.446 -20.7869917 23.484812 1.6622370 0.5553310 1.6586534

Now, we can plot the predictions of the four models we created in this analysis.

#Adjust future predictors

auto_simple_forecast <- forecast(
    auto_simple_fit,
    h = prediction_length)

auto_forecast <- forecast(
    auto_fit,
    h = prediction_length, 
    xreg=p_xreg_test)

hyper_forecast <- forecast(
  hyper_fit,
  h = prediction_length, 
  xreg=p_xreg_test
)

hyper_no_drift_forecast <- forecast(
  hyper_fit_no_drift,
  h = prediction_length, 
  xreg=p_xreg_test
)

base_plot <- autoplot(
  ts(
      c(p_xreg[,1]),
      start = c(
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")), 
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
      frequency = 12),
    series = 'mean natural gas') +
  autolayer(
    ts(
      c(p_xreg),
      start = c(
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%Y")), 
    as.numeric(format(date_limit_reduced_df["min_date"][[1]], "%m"))),
      frequency = 12),
    series = 'actual natural gas') + 
  autolayer(
    snaive(y = eng_coal_train_ts, length(eng_coal_test_ts)),
    PI = FALSE,
    series = "naive") +
  labs(x="Date", y="Million kWh") +
  scale_x_continuous(
    labels=function(x){as.integer(x)},
    limits=c(
      as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))-5, 
      as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))+1),
    breaks = seq(
      as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))-5, 
      as.numeric(format(date_limit_reduced_df["max_date"][[1]], "%Y"))+1, 
      1)) +
  autolayer(
    eng_coal_ts,
    series = "actual coal"
  ) +
  plot_theme +
  scale_color_viridis_d()

base_plot +
  autolayer(auto_simple_forecast, alpha = 0.3) +
  autolayer(
    auto_simple_forecast,
    series = "Prediction",
    PI = FALSE) +
  ggtitle(
    'Auto Arima Model - No Predictors', 
    subtitle = as.character(auto_simple_fit))

base_plot +
  autolayer(auto_forecast, alpha = 0.3) +
  autolayer(
    auto_forecast,
    series = "Prediction",
    PI = FALSE) +
  ggtitle(
    'Auto Arima Model', 
    subtitle = as.character(auto_fit)) +
  plot_theme

base_plot + 
  autolayer(hyper_forecast, alpha = 0.3) +
  autolayer(
    hyper_forecast,
    series = "Prediction",
    PI = FALSE) +
  ggtitle(
    'Hyperparameter Model', 
    subtitle = as.character(hyper_fit)) +
  plot_theme

base_plot + 
  autolayer(hyper_no_drift_forecast, alpha = 0.3) +
  autolayer(
    hyper_no_drift_forecast,
    series = "Prediction",
    PI = FALSE) +
  ggtitle(
    'Hyperparameter Model w/o Drift', 
    subtitle = as.character(hyper_fit_no_drift)) +
  plot_theme

As we can see, the differences between models is minimal, which means we should lean towards selecting the simple arima model with no predictors until we can find some significant attributes that would reduce the error. Given that the simple model has less than a 9% mean absolute percentage error, it seems as though the decrease in consumption of coal is on a steady decline without any correlated fluctuations to the natural gas consumption. Therefore, it is unclear if using natural gas consumption as a predictor is useful, but it does not appear to be in this simple model. If additional feature engineering is performed or other significant predictors are included, then that may remove some of the noise. In general, there are multiple options to choose from this analysis that will give relatively good predictions, and the selection of one over the other may depend on the purposes for it’s future use. Obviously, other model types could be chosen; however, it seems as though more meaningful predictors would be the key to acheiving higher levels of accuracy.