from __future__ import print_function # For Python 2 / 3 compatibility
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import chardet
import math
import matplotlib.pyplot as plt
#np.random.seed(0)
#%pip install cchardet
#import cchardet as chardet
# Load the eThekwini skills data.
# The spreadsheet (ethekwini_skills.xls) was re-saved from Excel as a CSV and
# that export is what gets read: an .xls workbook is not delimited text, so
# it cannot be opened as UTF-8 and handed to pd.read_csv.
# The CSV is opened the same way as before (platform default encoding), but
# inside a ``with`` block so the handle is closed once pandas has read it.
with open(r'C:\Users\user\Downloads\ethekwini_skill.csv') as file:
    data = pd.read_csv(file)
data.head()
data.tail()
# The real column titles sit in the first row of the frame: lift that row
# out, keep every row beneath it, then install the lifted row as the header.
new_header, data = data.iloc[0], data.iloc[1:]
data.columns = new_header
data.head()
# Quick size / dtype overview of the re-headed frame.
data.shape
data.info()
# Missing-value map: one cell per value, coloured by whether isnull() is True.
sns.heatmap(data.isnull(), yticklabels=False, cmap='viridis')
data.columns
data.nunique()
# Distinct values of the categorical fields of interest.
data['Gender'].unique()
data['Disability'].unique()
data['Skill Sector'].unique()
# Report how many distinct values each object-dtype column holds.
# unique() keeps NaN, so a column with gaps counts the gap as one category.
for col_name in data.columns:
    if data[col_name].dtypes == 'object':
        unique_cat = len(data[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))
# Build the working frame without five columns: the first three listed are
# dropped for incomplete data, the last two for irrelevance/redundancy
# (policy can be made with regards to wards and not suburbs).
ndata = data.drop(
    columns=[
        'Skill Description',
        'Skill Experience Duration',
        'Informal Training Description',
        'Suburb',
        'Informal Experience Duration',
    ]
)
ndata.head()
# Re-draw the missing-value map for the trimmed frame.
sns.heatmap(ndata.isnull(), yticklabels=False, cmap='viridis')
# Show the dtype pandas assigned to every remaining column.
dataTypeSeries = ndata.dtypes
print('Data type of each column of Dataframe :', dataTypeSeries, sep='\n')
# Cast Age to int so the histogram and summary below treat it as a number.
ndata['Age'] = ndata['Age'].astype(int)
print (ndata['Age'].dtypes)
# Two views of the age distribution: a plain histogram and seaborn's distplot.
ndata['Age'].plot.hist()
sns.distplot(ndata['Age'])
ndata['Age'].describe()
# Age is already int from the cast above, so this conversion leaves it as is.
ndata['Age'] = pd.to_numeric(ndata['Age'])
# Replace every age above 15 with the label '15-20'; all other ages are taken
# from the existing column.
# NOTE(review): the test has no upper bound, so ages well above 20 also get
# the '15-20' label, and the column now mixes a text label with numbers —
# confirm this is the intended banding.
ndata['Age'] = np.where(ndata['Age'] > 15, '15-20', ndata['Age'])
ndata.head()
# Subset of rows whose 'Ward No' equals the string '28'.
nndata = ndata.loc[ndata['Ward No']=='28']
nndata.head(5)
nndata.shape
# Re-check the Age dtype after the relabelling above.
print (ndata['Age'].dtypes)
# Skill-sector counts for the ward-28 subset (nndata).
ax1 = sns.countplot(
    data=nndata,
    x="Skill Sector",
    ax=plt.figure(figsize=(16, 16)).gca(),
)
# Slant the category labels so long sector names do not overlap.
ax1.set_xticklabels(ax1.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

# Same chart over the full working frame (ndata).
ax2 = sns.countplot(
    data=ndata,
    x="Skill Sector",
    ax=plt.figure(figsize=(16, 16)).gca(),
)
ax2.set_xticklabels(ax2.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()

# Qualification counts over the full working frame, on a smaller canvas.
ax4 = sns.countplot(
    data=ndata,
    x='Qualification',
    ax=plt.figure(figsize=(8, 8)).gca(),
)
ax4.set_xticklabels(ax4.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
# Look at the balance of the categorical fields before encoding them.
sns.countplot(data=ndata, x="Disability")
ndata['Disability'].value_counts()
ndata['Gender'].value_counts()  # less than 1% unspecified and 44% male

# Indicator (dummy) columns for each categorical field; drop_first leaves out
# the first category of each so it acts as the baseline.
# NOTE(review): no prefix is given, so if two of these fields share category
# labels the concat below would produce duplicate column names — confirm.
genders, license, disability = (
    pd.get_dummies(ndata[source_col], drop_first=True)
    for source_col in ('Gender', 'Drivers Licence', 'Disability')
)
genders.head(5)

# Swap the original text columns for their indicator columns.
ndata.drop(columns=['Gender', 'Disability', 'Drivers Licence'], inplace=True)
ndata.head()
ndata = pd.concat([ndata, genders, license, disability], axis='columns')
ndata
# Counts per training-type combination, from the qualitative study by
# Moodley, S (2017), GIBS, for MBA.
gibs_study = {"Formal,Informal & Non-formal": 5, "Formal & Non-formal": 4, "Formal & Informal": 2, "Formal": 1, "Informal": 1, "Non-formal": 0}
# Pie chart of those counts: wedge sizes are the dict values, wedge labels
# are the dict keys, and each wedge is annotated with its whole-number share.
plt.pie(
    list(map(float, gibs_study.values())),
    labels=list(gibs_study),
    autopct='%1.0f%%',
    pctdistance=0.8,
    labeldistance=1.2,
)
# Business-licence data for all regions.
dataz = pd.read_csv(r'C:\Users\user\Downloads\ethekwini-business-license-data-in-all-regions.csv')
dataz.head(10)

# First pass: remove columns that the licence-type count below does not use.
dataz.drop(
    columns=[
        'Proprietor', 'BusinessName', 'RefNo', 'PhysicalAddress',
        'Telephone', 'PostalAddress', 'LicenseSubType', 'CurrentDate',
    ],
    inplace=True,
)
dataz.head()

# Second pass: remove the remaining date/notification fields and the two
# 'Unnamed' columns (presumably empty trailing columns in the CSV — confirm).
dataz.drop(
    columns=[
        'Employer', 'AnnualNotification', 'LicenseIssueDate',
        'NotificationUpdateDate', 'Unnamed: 13', 'Unnamed: 14',
    ],
    inplace=True,
)
dataz.head()
dataz.tail()
dataz.info()
dataz.nunique()

# How many licences there are of each type.
ax3 = sns.countplot(
    data=dataz,
    x='LicenseType',
    ax=plt.figure(figsize=(8, 8)).gca(),
)
ax3.set_xticklabels(ax3.get_xticklabels(), ha="right", rotation=40)
plt.tight_layout()
plt.show()
dataz.head(50)
# Unemployment-rate table.
datas = pd.read_csv(r'C:\Users\user\Downloads\unemploymentrates.csv')
datas.head(10)
# Drop the final row of the file: iloc[:-1] keeps everything except the last
# record (the header is untouched).
datas = datas.iloc[:-1]
datas.head(10)
# Correlate only the numeric / boolean columns. Older pandas skipped other
# columns silently, while pandas >= 2.0 raises on them, so select explicitly
# to get the same matrix on either version.
# NOTE(review): if the dropped last row was text, numeric columns may have
# been read as object and will be excluded here — confirm the dtypes.
corelation = datas.select_dtypes(include=['number', 'bool']).corr()
# Label both axes with the matrix's own labels (the y axis previously received
# the whole DataFrame rather than its index).
sns.heatmap(corelation, xticklabels=corelation.columns, yticklabels=corelation.index, annot=True)
from google