# Import important libraries
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
from pandas import read_csv
df1 = pd.read_excel(r"C:\Users\Student 95\Downloads\electricity-use-ethekwini.xlsx")
df1.head()
# Drop variables that do not contribute to the goal of the analysis
df1 = df1.drop(columns=['Percent energy growth (%)','Percent growth (%)','Percent loss (%)',
                        'Power factor at system peak (%)','Average monthly load factor (%)',
                        'Percent growth','Maximum kVA','Energy (kWh) purchased','Number of customers'])
df1.shape
df1.head()
df2 = pd.read_excel(r"C:\Users\Student 95\Downloads\electricity-customers-and-revenue.xlsx")
df2.head()
df2.shape
# Rename the unnamed year column
df2 = df2.rename(columns={'Unnamed: 0': 'Year'})
# Merge the two datasets on Year
power = pd.merge(df1, df2, on="Year", how="inner")
power.head()
power.columns
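A quick sanity check on the join is worthwhile here; the sketch below assumes Year should uniquely identify each row after an inner merge.
# Sanity-check the merge: an inner join should not gain rows,
# and Year should uniquely identify each record
print(df1.shape, df2.shape, power.shape)
assert power['Year'].is_unique, 'Duplicate Year values after merge'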
power = power[['A: Total number of customers', 'B: Total use','C: Total revenue: total','D: Cents per total',
'E: Use per total','F: Rands per total','Year', 'Energy (kWh) sold']]
power.head()
columns_rearranged = ['Year', 'A: Total number of customers','C: Total revenue: total','D: Cents per total',
'E: Use per total','F: Rands per total', 'Energy (kWh) sold']
# Rearranging the columns
power = power.reindex(columns=columns_rearranged)
power.head()
# Inspect the nature of the dataset and the attribute types
power.info()
power.head()
#Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Count the null values in the entire DataFrame
power.isnull().sum().sum()
power.head()
power.columns[power.isna().any()].tolist()
power.head()
#Drop columns in which every value is NaN
power = power.dropna(axis=1, how='all')
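If partially missing columns had to be kept rather than dropped, imputation is a common alternative; a minimal sketch using column medians (an assumption, not the approach taken above):
# Hypothetical alternative: fill remaining NaNs with column medians
# instead of dropping the columns
power_imputed = power.fillna(power.median(numeric_only=True))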
#We plot histograms to check the distribution of each attribute
power.hist(column=columns_rearranged, rwidth=0.95, figsize=(15,8))
#We do a count plot of Energy (kWh) sold across the year range, to check that it corresponds with the dataset.
from matplotlib.pyplot import figure
figure(figsize=(16,8))
sns.countplot(data=power, x='Year', hue='Energy (kWh) sold')
# Plot a histogram to compare the frequency distribution of the 'D: Cents per total' variable.
# We also define bins so different ranges of the distribution can be compared.
bins=[40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200]
plt.hist(power['D: Cents per total'], rwidth=0.75, bins=bins, color='g')
plt.xlabel('Cents range')
plt.ylabel('Frequency')
plt.title('Cents Per Total')
# Scatter plots between pairs of variables to understand their relationships.
sns.pairplot(power)
from pandas.plotting import scatter_matrix
scatter_matrix(power, alpha=0.2, figsize=(16, 10), diagonal='kde')
power = power.astype('int64')
power.columns
power['C: Total revenue: total'] = power['C: Total revenue: total'].astype('int64')
power['D: Cents per total'] = power['D: Cents per total'].astype('int64')
power['E: Use per total'] = power['E: Use per total'].astype('int64')
power['F: Rands per total'] = power['F: Rands per total'].astype('int64')
power['Energy (kWh) sold'] = power['Energy (kWh) sold'].astype('int64')
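astype('int64') raises on NaN or non-numeric strings, so if future exports contain stray text a coercing pass is a safer pattern; a sketch (a hypothetical guard, to be applied before the casts above):
# Hypothetical guard: coerce non-numeric entries to NaN so bad cells
# surface as missing values rather than raising during the cast
for col in power.columns:
    power[col] = pd.to_numeric(power[col], errors='coerce')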
power.info()
#Important libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = power.iloc[:,1:6]
y = power.iloc[:,-1]
#Apply the SelectKBest class with chi2 to score the five candidate features
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#Concat two DataFrames for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['col','Score']
featureScores
# Print the four top-scoring features
print(featureScores.nlargest(4,'Score'))
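Note that chi2 expects a non-negative feature matrix and a categorical target; for a continuous target such as energy sold, f_regression is a common alternative. A sketch using the same X and y:
from sklearn.feature_selection import f_regression

# F-statistics and p-values for each feature against the target
f_scores, p_values = f_regression(X, y)
print(pd.DataFrame({'col': X.columns, 'F-score': f_scores, 'p-value': p_values}))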
sns.countplot(data=power, x='Energy (kWh) sold', palette='hls')
# Save the figure before plt.show(), which clears the current figure
plt.savefig('count_plot')
plt.show()
#The correlation of each feature in the dataset
corrmat = power.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(16,8))
#We plot the heatmap
sns.heatmap(power[top_corr_features].corr(), annot=True, cmap='RdYlGn')
To understand the model performance, we split the dataset into a training set and a test set.
# Split X and y into training and testing set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
75% of the data will be used for model training and 25% for model testing.
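A quick shape check confirms the split proportions:
# Verify the 75/25 split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)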
# import the class
from sklearn.linear_model import LogisticRegression
#instantiate the model
logreg = LogisticRegression()
#fit the model with train data
logreg.fit(X_train, y_train)
#use the fitted model to predict on the test data (X_test)
y_pred = logreg.predict(X_test)
y_pred
# import the metrics
from sklearn import metrics
cf_matrix = metrics.confusion_matrix(y_test, y_pred)
cf_matrix
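The raw matrix is easier to read as a heatmap; a minimal sketch reusing the seaborn import above:
# Visualize the confusion matrix as an annotated heatmap
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()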
# Import the libraries (pipeline and models)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
pipeline_dt=Pipeline([('dt_classifier',DecisionTreeClassifier(random_state=0))])
pipeline_rf=Pipeline([('rf_classifier',RandomForestClassifier())])
pipeline_knn=Pipeline([('kn_classifier',KNeighborsClassifier())])
pipeline_lr=Pipeline([('lr_classifier',LogisticRegression())])
#Make the list of pipelines
pipelines = [pipeline_dt,pipeline_rf,pipeline_knn,pipeline_lr]
best_accuracy=0.0
best_classifier=0
best_pipeline=""
#Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Decision Tree', 1: 'RandomForest', 2: 'KNeighbors', 3:'Logistic Regression'}
#Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)
for i, model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier]))
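A per-class breakdown of the winning pipeline can be obtained with classification_report; a sketch assuming scikit-learn 0.22+, where the zero_division parameter (used here to silence warnings for classes absent from the predictions) is available:
from sklearn.metrics import classification_report

print(classification_report(y_test, best_pipeline.predict(X_test), zero_division=0))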
y_pred_0 = pipeline_dt.predict(X_test)
y_pred_0
y_pred_1 = pipeline_rf.predict(X_test)
y_pred_1
y_pred_3 = pipeline_knn.predict(X_test)
y_pred_3
y_pred_4=pipeline_lr.predict(X_test)
y_pred_4
# Use the decision-tree pipeline's predict method on the test data
predictions = pipeline_dt.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
import numpy as np
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'kWh.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
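The same quantities can be cross-checked against sklearn's built-in metrics; a sketch assuming scikit-learn 0.24+, where mean_absolute_percentage_error is available:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Cross-check the manual MAE and MAPE calculations above
print('MAE :', mean_absolute_error(y_test, predictions))
print('MAPE:', 100 * mean_absolute_percentage_error(y_test, predictions), '%')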
Ensemble techniques combine individual models to improve the stability and predictive power of the overall model. Boosting and gradient boosting are two such methods.
Boosting reduces bias by adding weak learners sequentially, with each new learner trying to correct its predecessor.
Gradient boosting minimizes a loss function (here MSE) by adding weak learners via a gradient-descent procedure.
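Concretely, for squared-error loss each boosting stage m fits a new weak learner h_m to the current residuals and adds it with a learning rate nu: F_m(x) = F_{m-1}(x) + nu * h_m(x), where h_m is trained on the residuals y - F_{m-1}(x), which are the negative gradient of the MSE loss.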
#From sklearn we load the Gradient Boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
%%time
GB_regressor = GradientBoostingRegressor()
GB_regressor.fit(X_train,y_train)
y_predictions = GB_regressor.predict(X_train)
y_predictions
from sklearn.metrics import r2_score, mean_squared_error
print('Score of Model:', r2_score(y_train,y_predictions))
print('Mean square error:', mean_squared_error(y_train, y_predictions))
Test predictions
y_predictions = GB_regressor.predict(X_test)
y_predictions
print('Score of Model:', r2_score(y_test,y_predictions))
print('Mean square error:', mean_squared_error(y_test,y_predictions))
from xgboost import XGBRegressor
%%time
xgb_regressor = XGBRegressor()
xgb_regressor.fit(X_train, y_train)
y_prediction2 = xgb_regressor.predict(X_train)
y_prediction2
print('Score of Model :', r2_score(y_train,y_prediction2))
print('Mean square error :', mean_squared_error(y_train,y_prediction2))
Test predictions
y_prediction3 = xgb_regressor.predict(X_test)
y_prediction3
print('Score of Model :', r2_score(y_test,y_prediction3))
print('Mean square error :', mean_squared_error(y_test,y_prediction3))
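A small side-by-side table makes the final comparison easier to read; a sketch reusing the test-set predictions above:
# Compare the two boosting models on the test set
results = pd.DataFrame({
    'Model': ['GradientBoosting', 'XGBoost'],
    'R2': [r2_score(y_test, y_predictions), r2_score(y_test, y_prediction3)],
    'MSE': [mean_squared_error(y_test, y_predictions), mean_squared_error(y_test, y_prediction3)],
})
print(results)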