In this analysis, we will explore the preprocessed electricity dataset from the EIA bulk file download site. This dataset contains generation, retail, and quality statistics for U.S. plants. With the data in this format, we can easily aggregate it and produce some general summaries. Here we will focus on net generation by fuel and state, and try to extract some meaningful insights from those attributes.
import os, sys
import pprint as p
import pyspark.sql.functions as pysF
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
py_file_path = os.path.join(
    os.getcwd(),
    "..",
    ".."
)
sys.path.append(py_file_path)
from app.SparkTools import MyPySpark
MySpark = None
#ensure only one sc and spark instance is running
global MySpark
MySpark = MySpark or MyPySpark(
    master = 'local[3]',
    logger_name = 'jupyter')
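MyPySpark is a project-specific wrapper from app.SparkTools, and its internals are not shown in this notebook. For readers without that module, here is a rough sketch of what it is assumed to provide (a SparkSession bound to a local three-core master); the real class also wires up logging and a melt() helper that are not reproduced here.
# Assumed plain-PySpark equivalent of the MyPySpark wrapper used above.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[3]") \
    .appName("jupyter") \
    .getOrCreate()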
# Graphing format functions
def treemap_fig_format(in_fig, title_text):
    """
    Provide standard formatting to treemap figures
    """
    return in_fig.update_layout(
        title = {
            "text" : title_text,
            "font" : {
                "size" : 20},
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
There are a number of metrics that can be analyzed in this data set, but for this report we will focus on just a few. Specifically, we will look at net generation by fuel source, together with customer account data, to produce both overall and normalized metrics.
electricity_dim_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityPlantLevelDimDF")\
    .filter(
        (pysF.col("series_id").rlike(r"^ELEC\.PLANT\.GEN\.")) &
        (pysF.lower(pysF.col("engine_type")) == 'all primemovers') &
        (pysF.col("f") == "M"))\
    .withColumn(
        "fuel_type",
        pysF.regexp_replace(
            pysF.regexp_replace(
                pysF.col("fuel_type"),
                r"[^a-zA-Z0-9\s]",
                ""),
            r"\s",
            "_"))\
    .filter(pysF.col("iso3166").isNotNull())\
    .withColumn(
        "state",
        pysF.regexp_extract(
            pysF.col("iso3166"),
            r"^USA-([A-Z]+)",
            1))\
    .select("series_id", "state", "fuel_type", "engine_type", "units", "plant_name")
electricity_fact_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityFactDF")

electricity_df = electricity_fact_df.join(
    pysF.broadcast(electricity_dim_df),
    on = "series_id",
    how = "right"
)
#Pivot data and create an 'other' column that captures the
#difference between total and the sum of the fuel_type categories
fuel_types_l = electricity_df\
    .select("fuel_type")\
    .distinct()\
    .rdd.map(lambda x: x[0])\
    .collect()
electricity_pivoted_summary_df = electricity_df\
    .groupBy("date", "state", "plant_name")\
    .pivot("fuel_type")\
    .agg(pysF.sum("value"))\
    .fillna(
        0,
        subset=[i for i in fuel_types_l if i])\
    .drop("null")\
    .withColumn(
        "other",
        pysF.col("all_fuels") - sum(pysF.col(i) for i in fuel_types_l if i and i != "all_fuels"))
#Unpivot back to original format
electricity_formatted_df = MySpark.melt(
    df = electricity_pivoted_summary_df,
    id_vars = ["date", "state", "plant_name"],
    value_vars = [i for i in fuel_types_l if i and i != "all_fuels"],
    var_name = 'fuel_type')
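Note that melt is a helper on the custom MyPySpark wrapper, not a built-in DataFrame method. For readers without SparkTools, here is a minimal sketch of an equivalent unpivot using Spark SQL's stack() expression; the function name and defaults are illustrative, not the wrapper's actual signature.
# Hypothetical stand-in for MySpark.melt: unpivot value_vars into
# (var_name, value_name) rows with Spark SQL's stack() expression.
def melt_sketch(df, id_vars, value_vars, var_name="variable", value_name="value"):
    # stack(n, 'name1', col1, 'name2', col2, ...) emits one row per value_var
    pairs = ", ".join("'{0}', `{0}`".format(c) for c in value_vars)
    stack_expr = "stack({}, {}) as ({}, {})".format(
        len(value_vars), pairs, var_name, value_name)
    return df.select(*id_vars, pysF.expr(stack_expr))

# e.g. melt_sketch(electricity_pivoted_summary_df,
#                  id_vars=["date", "state", "plant_name"],
#                  value_vars=[i for i in fuel_types_l if i and i != "all_fuels"],
#                  var_name="fuel_type")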
#Create state summary df for 2019
state_2019_df = electricity_formatted_df\
    .filter(
        (pysF.year("date") == "2019") &
        (pysF.col("value") > 0))\
    .groupBy("state", "plant_name", "fuel_type")\
    .agg(pysF.sum("value").alias("generation"))\
    .withColumn("Net Generation", pysF.lit("Net Generation"))
For a first look, we can visualize how much generated electricity came from each fuel type. In the following plot, we show the 2019 net generation for every plant in the U.S., aggregated by state. Right away, we can see that there are a few major fuel sources (e.g. natural gas, nuclear, coal) as well as many smaller categories. Note, you can click on individual boxes to zoom in on the data, and click on the outer box to zoom out.
fig = px.treemap(
    data_frame = state_2019_df.toPandas(),
    names = "fuel_type",
    values = "generation",
    path = ["Net Generation", "fuel_type", "state", "plant_name"],
)
treemap_fig_format(
    fig,
    title_text = "Net Generation From Fuel Source Per State (MWH) - 2019").show()
In order to simplify the dataset, we will collect all of the coal and renewable categories into single groups, and lump the minor categories into an 'other' category. Here we see a cleaner picture of the general fuel mix. Natural gas, coal, and nuclear are the predominant sources, and the largest producers appear to be the bigger states, like California and Texas. Later on we will take a look at normalized values, but for now we will continue with the raw totals.
electricity_formatted_reduced_df = electricity_formatted_df\
    .withColumn(
        "fuel_type",
        pysF.when(
            pysF.col("fuel_type").rlike(".*_coal$"),
            "coal")
        .otherwise(
            pysF.when(
                (pysF.col("fuel_type").isin([
                    "geothermal",
                    "solar",
                    "wind"])) |
                (pysF.col("fuel_type").rlike(".*hydroelectric.*")),
                "renewable")
            .otherwise(
                pysF.when(
                    pysF.col("fuel_type").isin(["natural_gas", "nuclear"]),
                    pysF.col("fuel_type"))
                .otherwise("other"))))
electricity_formatted_reduced_df.cache()
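The nested when/otherwise blocks above can also be written as a flat chain of when() clauses, which some readers may find easier to scan. A behavior-equivalent sketch using the same category rules:
# Flatter formulation of the fuel_type recoding using chained when() clauses
# instead of nested otherwise(when(...)) blocks.
reduced_fuel_type = (
    pysF.when(pysF.col("fuel_type").rlike(".*_coal$"), "coal")
    .when(
        pysF.col("fuel_type").isin(["geothermal", "solar", "wind"]) |
        pysF.col("fuel_type").rlike(".*hydroelectric.*"),
        "renewable")
    .when(
        pysF.col("fuel_type").isin(["natural_gas", "nuclear"]),
        pysF.col("fuel_type"))
    .otherwise("other"))

# usage: electricity_formatted_df.withColumn("fuel_type", reduced_fuel_type)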
state_2019_reduced_df = electricity_formatted_reduced_df\
    .filter(
        (pysF.year("date") == "2019") &
        (pysF.col("value") > 0))\
    .groupBy("state", "plant_name", "fuel_type")\
    .agg(pysF.sum("value").alias("generation"))\
    .withColumn("Net Generation", pysF.lit("Net Generation"))
state_2019_reduced_df.cache()
DataFrame[state: string, plant_name: string, fuel_type: string, generation: double, Net Generation: string]
fig = px.treemap(
    data_frame = state_2019_reduced_df.toPandas(),
    names = "fuel_type",
    values = "generation",
    path = ["Net Generation", "fuel_type", "state", "plant_name"]
)
treemap_fig_format(
    fig,
    title_text = "Net Generation From Fuel Source Per State, General Fuel Categories (MWH) - 2019").show()
We can also take a quick look at the converse treemap, which groups the generation by state first, then fuel source. One interesting observation here is that most states have a mixture of the 'big three' fuel sources (natural gas, coal, nuclear), and almost no states rely entirely on one type of fuel.
fig = px.treemap(
    data_frame = state_2019_reduced_df.toPandas(),
    names = "fuel_type",
    values = "generation",
    path = ["Net Generation", "state", "fuel_type", "plant_name"]
)
treemap_fig_format(
    fig,
    title_text = "Net Generation By State Per Fuel Source (MWH) - 2019").show()
Evaluating the totals is useful; however, it is also useful to see how the fuel mix changes over time. For this next plot, we will compare the current year (in this case 2019) to averages of previous years. In order to smooth the data, and avoid comparing against any off-years, we will take the difference between the current five-year average (i.e. 2015 - 2019) and the previous five-year average (i.e. 2010 - 2014).
from pyspark.sql.window import Window as pysW

# previous 5-year window: the 5 rows ending 5 years before the current row
# (e.g. 2010 - 2014 for the 2019 row)
state_fuel_5_year_windowspec = pysW\
    .partitionBy("state", "fuel_type", "plant_name")\
    .orderBy("year")\
    .rowsBetween(-9, -5)

# current 5-year window: the current row and the 4 preceding rows
# (e.g. 2015 - 2019 for the 2019 row)
state_fuel_3_prev_year_windowspec = pysW\
    .partitionBy("state", "fuel_type", "plant_name")\
    .orderBy("year")\
    .rowsBetween(-4, pysW.currentRow)

state_consumption_diff_df = electricity_formatted_reduced_df\
    .withColumn(
        "year",
        pysF.year("date"))\
    .groupBy(
        "year",
        "state",
        "plant_name",
        "fuel_type")\
    .agg(pysF.sum("value").alias("generation"))\
    .withColumn(
        "generation_diff",
        pysF.coalesce(pysF.avg(pysF.col("generation")).over(state_fuel_3_prev_year_windowspec), pysF.lit(0)) -
        pysF.coalesce(pysF.avg(pysF.col("generation")).over(state_fuel_5_year_windowspec), pysF.lit(0)))\
    .orderBy("state", "fuel_type", "year")
state_2019_diff_reduced_increase_df = state_consumption_diff_df\
    .select("year", "state", "plant_name", "fuel_type", "generation_diff")\
    .filter(
        (pysF.col("year") == "2019") &
        (pysF.col("generation_diff").isNotNull()) &
        (pysF.col("generation_diff") > 0))\
    .withColumn("Net Generation", pysF.lit("Net Generation"))

state_2019_diff_reduced_decrease_df = state_consumption_diff_df\
    .select("year", "state", "plant_name", "fuel_type", "generation_diff")\
    .filter(
        (pysF.col("year") == "2019") &
        (pysF.col("generation_diff").isNotNull()) &
        (pysF.col("generation_diff") < 0))\
    .withColumn(
        "generation_diff",
        pysF.col("generation_diff") * -1)\
    .withColumn("Net Generation", pysF.lit("Net Generation"))
First, we can take a look at the net generation that increased from the previous period to the current one, and see how that relates to the fuel sources used to create the energy. It is evident that there has been a large increase in natural gas and renewable generation, with nuclear power following behind.
fig = px.treemap(
    data_frame = state_2019_diff_reduced_increase_df.toPandas(),
    names = "fuel_type",
    values = "generation_diff",
    path = ["Net Generation", "fuel_type", "state", "plant_name"]
)
treemap_fig_format(
    fig,
    title_text = "Five Year Average Increase in Net Generation By Fuel Per State (MWH) - 2015 vs 2019").show()
In the plot of decreasing generation, the major fuel source being reduced is coal.
fig = px.treemap(
    data_frame = state_2019_diff_reduced_decrease_df.toPandas(),
    names = "fuel_type",
    values = "generation_diff",
    path = ["Net Generation", "fuel_type", "state", "plant_name"]
)
treemap_fig_format(
    fig,
    title_text = "Five Year Average Decrease in Net Generation By Fuel Per State - 2015 vs 2019").show()
customer_dim_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityRetailDimDF")\
    .filter(
        (pysF.col("series_id").rlike(r"^ELEC\.CUSTOMERS\.")) &
        (pysF.col("sector") == "all sectors") &
        (pysF.col("iso3166").isNotNull()) &
        (pysF.col("iso3166") != "USA") &
        (pysF.col("f") == "M"))\
    .withColumn(
        "state",
        pysF.regexp_extract(
            pysF.col("iso3166"),
            r"^USA-([A-Z]+)",
            1))\
    .select("series_id", "state")

customer_fact_df = MySpark\
    .spark\
    .read\
    .parquet("/Processed/ElectricityFactDF")

customer_df = customer_fact_df.join(
    pysF.broadcast(customer_dim_df),
    on = "series_id",
    how = "right")\
    .withColumnRenamed("value", "customers")
First, we will check to make sure that the customer account data is complete enough to join onto the net generation data above. From the preliminary check we can see that the latest year containing nulls, for any state, is 2016. In fact, only one state has null data for 2016, which is Alaska. Given the small amount of missing data, we can fill the gap ourselves by drawing a straight line between the two endpoints surrounding the missing cells, without writing any complex functions. This is not strictly necessary for the next set of analyses, but will be useful for future time series studies.
electricity_customer_df = electricity_formatted_reduced_df.join(
    customer_df,
    on = ["state", "date"],
    how = "left")\
    .withColumn(
        "year",
        pysF.year(pysF.col("date")))

electricity_customer_df\
    .groupBy(["year", "state"])\
    .agg(pysF.sum(pysF.col("customers").isNull().cast("integer")).alias("null_count"))\
    .filter(
        (pysF.col("null_count") > 0) &
        (pysF.col("year") < 2020)
    )\
    .orderBy(pysF.col("year").desc())\
    .limit(10)\
    .toPandas()\
    .head()
|   | year | state | null_count |
|---|------|-------|------------|
| 0 | 2016 | AK    | 65559      |
| 1 | 2007 | OH    | 51480      |
| 2 | 2007 | NV    | 21801      |
| 3 | 2007 | CT    | 25272      |
| 4 | 2007 | CA    | 333918     |
Now we can use a simple when statement to replace the missing values. A quick output check shows that sensible values have been imputed into the null fields, and that the dataframe, in the original format, now shows no missing data (2007 and previous years have been removed).
# endpoints for the straight-line fill: the largest 2015 value and the
# smallest 2017 value for Alaska
max_ak_2016 = float(
    max(
        customer_df\
        .filter(
            (pysF.col("state") == "AK") &
            (pysF.year("date") == 2015))\
        .rdd\
        .map(lambda x: x["customers"])\
        .collect()))

min_ak_2017 = float(
    min(
        customer_df\
        .filter(
            (pysF.col("state") == "AK") &
            (pysF.year("date") == 2017))\
        .rdd\
        .map(lambda x: x["customers"])\
        .collect()))

# fill the 12 missing 2016 months in 13 equal steps between the endpoints
electricity_customer_df = electricity_customer_df\
    .withColumn(
        "customers",
        pysF.when(
            ((pysF.col("state") == "AK") &
             (pysF.year("date") == 2016)),
            ((min_ak_2017 - max_ak_2016) / 13 * pysF.month(pysF.col("date")) + max_ak_2016))
        .otherwise(pysF.col("customers")).cast("integer"))\
    .filter(pysF.year("date") > 2007)
electricity_customer_df.cache()
DataFrame[state: string, date: date, plant_name: string, fuel_type: string, value: double, series_id: string, customers: int, year: int]
electricity_customer_df\
    .filter(
        (pysF.col("state") == "AK") &
        (pysF.col("year").isin([2015, 2016, 2017])))\
    .groupBy(["date"])\
    .agg(pysF.avg(pysF.col("customers")).alias("customers"))\
    .withColumn(
        "year",
        pysF.year(pysF.col("date")))\
    .withColumn(
        "month_day",
        pysF.concat_ws(
            "-",
            pysF.lpad(pysF.month(pysF.col("date")), 2, "0"),
            pysF.lpad(pysF.dayofmonth(pysF.col("date")), 2, "0")))\
    .groupBy(["month_day"])\
    .pivot("year")\
    .agg(pysF.avg(pysF.col("customers")).alias("customers"))\
    .limit(12)\
    .orderBy("month_day")\
    .toPandas()
|    | month_day | 2015     | 2016     | 2017     |
|----|-----------|----------|----------|----------|
| 0  | 01-01     | 331677.0 | 336446.0 | 338282.0 |
| 1  | 02-01     | 331668.0 | 336599.0 | 338307.0 |
| 2  | 03-01     | 332583.0 | 336752.0 | 339073.0 |
| 3  | 04-01     | 333387.0 | 336905.0 | 339129.0 |
| 4  | 05-01     | 333680.0 | 337058.0 | 340120.0 |
| 5  | 06-01     | 334411.0 | 337211.0 | 341092.0 |
| 6  | 07-01     | 334993.0 | 337364.0 | 341026.0 |
| 7  | 08-01     | 335469.0 | 337517.0 | 342212.0 |
| 8  | 09-01     | 336042.0 | 337670.0 | 341820.0 |
| 9  | 10-01     | 336294.0 | 337823.0 | 342470.0 |
| 10 | 11-01     | 335076.0 | 337976.0 | 341618.0 |
| 11 | 12-01     | 335380.0 | 338129.0 | 341354.0 |
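As a quick arithmetic cross-check of the straight-line fill, the imputed 2016 values can be reproduced by hand from the two endpoints visible in the table above (the 2015 maximum and the 2017 minimum):
# Reproduce the imputed 2016 AK values from the endpoints in the table above.
max_ak_2015 = 336294.0   # largest 2015 monthly value in the table above
min_ak_2017 = 338282.0   # smallest 2017 monthly value in the table above
step = (min_ak_2017 - max_ak_2015) / 13
print(int(max_ak_2015 + step * 1))    # January 2016  -> 336446
print(int(max_ak_2015 + step * 12))   # December 2016 -> 338129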
electricity_customer_df\
    .groupBy(["year", "state"])\
    .agg(pysF.sum(pysF.col("customers").isNull().cast("integer")).alias("null_count"))\
    .filter(
        (pysF.col("null_count") > 0) &
        (pysF.col("year") < 2020))\
    .orderBy(pysF.col("year").desc())\
    .limit(10)\
    .toPandas()\
    .head()
| year | state | null_count |
|------|-------|------------|
So now we can normalize the metrics by dividing the state-wide generation values by the total number of customers for each fuel. This provides values expressed as net generation (kWh) per total state customer account, and gives a clearer picture for state-to-state comparisons. Note that this metric is not the net generation per customer account served by an individual plant, since that information is unfortunately not available; within a state, larger boxes still simply correspond to plants with larger net generation.
electricity_customer_normalized_df = electricity_customer_df\
    .groupBy(["year", "state", "plant_name", "fuel_type"])\
    .agg(
        (pysF.sum(pysF.col("value")) * 1000 / pysF.avg(pysF.col("customers"))).alias("kWH_per_total_state_accounts"),
        pysF.avg(pysF.col("customers")).alias("customers"))\
    .withColumn("Net Generation", pysF.lit("Net Generation"))

electricity_customer_normalized_2019_df = electricity_customer_normalized_df\
    .filter(
        (pysF.col("year") == 2019) &
        (pysF.col("kWH_per_total_state_accounts") > 0))
electricity_customer_normalized_2019_df.cache()
DataFrame[year: int, state: string, plant_name: string, fuel_type: string, kWH_per_total_state_accounts: double, customers: double, Net Generation: string]
Before we get too far ahead of ourselves, we should check that the transformation is actually correct. Some simple aggregations reveal that the overall summed `kWH_per_total_state_accounts`, multiplied by the total state number of customers, for 2019 is approximately 3.587e+09 MWH, which is close to the 3.588e+09 MWH reported in the state summaries above.
gen_per_account = sum(
    electricity_customer_normalized_2019_df\
    .groupBy("year", "state", "customers")\
    .agg(pysF.sum(pysF.col("kWH_per_total_state_accounts")).alias("kWH_per_total_state_accounts"))\
    .withColumn(
        "MWH",
        pysF.col("kWH_per_total_state_accounts") / 1000 * pysF.col("customers"))\
    .rdd\
    .map(lambda x: x["MWH"])\
    .collect())

total_generation = sum(state_2019_reduced_df.select("generation").rdd.map(lambda x: x[0]).collect())

print('Total Generation from normalized summary (MWH): {:n}'.format(gen_per_account))
print('Total Generation state summary (MWH): {:n}'.format(total_generation))
Total Generation from normalized summary (MWH): 3.58698e+09
Total Generation state summary (MWH): 3.58794e+09
While not all customer accounts are the same, this is a first attempt at normalization. In fact, we can see some interesting results in the outputs below. First, net generation per customer account from coal is the largest group in the fuel type summary. In the state grouping summary we see that the largest net generation per customer account belongs to Wyoming, West Virginia, and North Dakota, all of which are predominantly coal consumers.
Additionally, it is interesting to find that only three (WY, WV, KY) of the five biggest coal-mining states (WY, WV, PA, IL, KY) use it as the primary fuel for generation.
fig = px.treemap(
    data_frame = electricity_customer_normalized_2019_df.toPandas(),
    names = "fuel_type",
    values = "kWH_per_total_state_accounts",
    path = ["Net Generation", "fuel_type", "state", "plant_name"])
treemap_fig_format(
    fig,
    title_text = "Normalized Net Generation By Fuel Per State (kWH Per total state customers) 2019").show()

fig = px.treemap(
    data_frame = electricity_customer_normalized_2019_df.toPandas(),
    names = "fuel_type",
    values = "kWH_per_total_state_accounts",
    path = ["Net Generation", "state", "fuel_type", "plant_name"])
treemap_fig_format(
    fig,
    title_text = "Normalized Net Generation By State Per Fuel Type (kWH Per total state customers) 2019").show()
state_generations_windowspec = pysW\
    .partitionBy("state")

electricity_customer_ratios_2019_df = electricity_customer_normalized_2019_df\
    .groupBy(["year", "state", "fuel_type"])\
    .agg(pysF.sum(pysF.col("kWH_per_total_state_accounts")).alias("kWH_per_total_state_accounts"))\
    .withColumn(
        "fuel_ratio",
        pysF.col("kWH_per_total_state_accounts") / pysF.sum(pysF.col("kWH_per_total_state_accounts")).over(state_generations_windowspec))

state_ratio_windowspec = pysW\
    .partitionBy("state")\
    .orderBy(pysF.col("fuel_ratio").desc())

state_primary_coal_2019_df = electricity_customer_ratios_2019_df\
    .withColumn(
        "primary_fuel_type",
        pysF.first("fuel_type").over(state_ratio_windowspec))\
    .filter(
        (pysF.col("fuel_type") == pysF.col("primary_fuel_type")) &
        (pysF.col("primary_fuel_type") == pysF.lit("coal")))
With a little additional work, we can visualize which states use coal as the primary source of generated electricity, and by how much. Pennsylvania and Illinois don't use coal as their primary source of net generation, but the remaining major coal-mining states do. Whether or not a state relies mainly on coal appears to be a regional factor, but further analysis would be required to confirm that.
chor_df = electricity_customer_ratios_2019_df.filter(pysF.col("fuel_type")=="coal")\
.toPandas()
fig = go.Figure(data=go.Choropleth(
locations=chor_df["state"], # Spatial coordinates
z = chor_df["fuel_ratio"].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Blues',
colorbar_title = "Percent of Net Generation",
))
fig.update_layout(
geo_scope='usa', # limite map scope to USA
)
treemap_fig_format(fig, title_text = 'Percent Net Generation From Coal - 2019').show()
For fun, let's also plot the share of net generation that came from natural gas and nuclear power in each state.
chor_df = electricity_customer_ratios_2019_df.filter(pysF.col("fuel_type")=="natural_gas")\
.toPandas()
fig = go.Figure(data=go.Choropleth(
locations=chor_df["state"], # Spatial coordinates
z = chor_df["fuel_ratio"].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Blues',
colorbar_title = "Percent of Net Generation",
))
fig.update_layout(
geo_scope='usa', # limite map scope to USA
)
treemap_fig_format(fig, title_text = 'Percent Net Generation From Natural Gas - 2019').show()
chor_df = electricity_customer_ratios_2019_df.filter(pysF.col("fuel_type")=="nuclear")\
.toPandas()
fig = go.Figure(data=go.Choropleth(
locations=chor_df["state"], # Spatial coordinates
z = chor_df["fuel_ratio"].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Blues',
colorbar_title = "Percent of Net Generation",
))
fig.update_layout(
geo_scope='usa', # limite map scope to USA
)
treemap_fig_format(fig, title_text = 'Percent Net Generation From Nuclear - 2019').show()
We have learned that natural gas is the primary fuel source for the U.S., and that larger states, in terms of net generation, do not typically rely on a single fuel source. Also, coal is the fuel source with the largest decline over the last five years, and it appears to be replaced mostly by natural gas and renewables. When we normalize the data by customer accounts, we see that coal is the primary fuel source for the states with the largest net generation per customer, which indicates it might be used by heavier consumers or feed grids with higher levels of transmission loss. Plots have been provided which visualize the percentage of net generation from each of the three major sources (coal, natural gas, and nuclear) to expose any possible geographical patterns.
An important note is that this is summary data, which means further detail can be uncovered by drilling down to the plant-level data, including the monthly time series. This general summary provides some insight, but more useful information remains to be found at the finer grain of this source data.