import glob
import pandas as pd
import numpy as np
db = '/Users/jennydoyle/Desktop/dsi/04-project-assets/globalterrorismdb_0616dist.csv'
df = pd.read_csv(db, header=0,low_memory=False)
print 'Size = ', df.shape
def eda(dataframe):
    from IPython.core import display as ICD
    print "dataframe shape: ", dataframe.shape  ## rows by columns
    print ''
    print 'num duplicates:', dataframe.duplicated().sum()  ## df.drop_duplicates() to remove dupes
    print ''
    print "pct missing values \n"
    ICD.display((dataframe.isnull().sum()/len(dataframe)*100).to_frame('Pct Missing'))  ## pct of null values per column
    print ''
    print "dataframe types \n"
    ICD.display(dataframe.dtypes.to_frame('Dtype'))  ## list data type of each column
    print ''
    print "dataframe describe \n"
    ICD.display(dataframe.describe())  ## stats -- mean, min, max, etc.
    print ''
    print '# unique values in series:\n'
    u_list = []
    for item in dataframe:  ## count number of unique values per column
        u_list.append([item, dataframe[item].nunique()])
    unique_vals = pd.DataFrame(u_list, columns=['Column','Num_unique'])
    ICD.display(unique_vals)
eda(df)
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
GOAL: Find something interesting to use in a Bayesian inference analysis.
df.attacktype1_txt.value_counts()
plt.hist(df.iyear,bins=46)
plt.title('Number of Attacks by Year',fontsize=30)
plt.show()
import seaborn as sns
plt.rcParams['figure.figsize'] = (10, 8)
# plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 10
# plt.rcParams['ytick.labelsize'] = 20
sns.countplot(x="attacktype1_txt", data=df)
plt.title('Number of Attacks by Type',size=30)
plt.xticks(rotation=70)
plt.show()
attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)
for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
attacks_by_type = df[['iyear','attacktype1_txt','attacktype1']][df.suicide==1].groupby(['iyear','attacktype1_txt']).attacktype1.count().reset_index()
attacks_by_type.rename(columns={'attacktype1_txt':'Attack Type'},inplace=True)
attacks_by_type.set_index('iyear',inplace=True)
for atype in list(df.attacktype1_txt.unique()):
    plt.plot(attacks_by_type[['attacktype1']][attacks_by_type['Attack Type']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.rcParams['figure.figsize'] = (10, 8)
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Type',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)
attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)
attacks_by_weapon
for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
df['weaptype1_txt'].replace({'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)':'Vehicle'},inplace=True)
attacks_by_weapon = df[['iyear','weaptype1_txt','weaptype1']][df.suicide==1].groupby(['iyear','weaptype1_txt']).weaptype1.count().reset_index()
attacks_by_weapon.rename(columns={'weaptype1_txt':'Weapon'},inplace=True)
attacks_by_weapon.set_index('iyear',inplace=True)
attacks_by_weapon
for atype in list(df.weaptype1_txt.unique()):
    plt.plot(attacks_by_weapon[['weaptype1']][attacks_by_weapon['Weapon']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Weapon',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
import seaborn as sns
# plt.rcParams['figure.figsize'] = (16, 14)
plt.rcParams['font.size'] = 30
plt.rcParams['font.family'] = 'Courier New, monospace'
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
countries = df.groupby(['country_txt']).attacktype1.count().reset_index().sort_values('attacktype1',ascending=False)[0:10]
countries = list(countries.country_txt)
countries.append('United States')  # make sure the US is included even if it isn't in the top 10
top_countries = df[df.country_txt.isin(countries)]
sns.countplot(x="country_txt", data=top_countries)
plt.title('Number of Attacks by Country',size=30)
plt.xticks(rotation=70)
plt.show()
attacks_by_region = df[['iyear','region_txt','region']].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)
for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Terror Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
attacks_by_region = df[['iyear','region_txt','region']][df.suicide==1].groupby(['iyear','region_txt']).region.count().reset_index()
attacks_by_region.rename(columns={'region_txt':'Region'},inplace=True)
attacks_by_region.set_index('iyear',inplace=True)
for atype in list(df.region_txt.unique()):
    plt.plot(attacks_by_region[['region']][attacks_by_region['Region']==atype],label=atype,linewidth=3.0)
# plt.plot(attacks_by_type,hue='Attack Type')
plt.xlabel('Year',fontsize=20)
plt.ylabel('Number of Attacks',fontsize=20)
plt.title('Suicide Attacks by Region',fontsize=30)
plt.legend(prop={'size':15})
plt.show()
from matplotlib import pyplot as plt
# get total attacks per year
attacks_by_year = df[['iyear','eventid']].groupby(['iyear']).eventid.count().reset_index()
attacks_by_year.rename(columns={'eventid':'Total Attacks'},inplace=True)
# get number of suicide attacks per year
suicide_by_year = df[['iyear','suicide']].groupby(['iyear']).suicide.sum().reset_index()
# get number of successful attacks per year
successes_by_year = df[['iyear','success']].groupby(['iyear']).success.sum().reset_index()
# get number of perpetrators per year (excluding the -99 "unknown" code)
nperps_by_year = df[['iyear','nperps']][df.nperps!=-99].groupby(['iyear']).nperps.sum().reset_index()
# get number killed per year
killed_by_year = df[['iyear','nkill']].groupby(['iyear']).nkill.sum().reset_index()
# get number wounded per year
wounded_by_year = df[['iyear','nwound']].groupby(['iyear']).nwound.sum().reset_index()
# get total property value damaged per year
propval_by_year = df[['iyear','propvalue']].groupby(['iyear']).propvalue.sum().reset_index()
# combine into one df
attacks_by_year = attacks_by_year.merge(suicide_by_year, on='iyear')
attacks_by_year = attacks_by_year.merge(successes_by_year, on='iyear')
attacks_by_year = attacks_by_year.merge(nperps_by_year, on='iyear')
attacks_by_year = attacks_by_year.merge(killed_by_year, on='iyear')
attacks_by_year = attacks_by_year.merge(wounded_by_year, on='iyear')
attacks_by_year = attacks_by_year.merge(propval_by_year, on='iyear')
attacks_by_year.rename(columns={'eventid':'Total Attacks',
'suicide':'Suicide Attacks',
'success':'Successful Attacks',
# 'nperps':'Number of Perpetrators',
'nkill':'Number Killed',
'nwound':'Number Wounded'},inplace=True)
attacks_by_year.set_index('iyear',inplace=True)
plt.plot(attacks_by_year['Total Attacks'], label="Total Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Suicide Attacks'], label="Suicide Attacks",linewidth=3.0)
plt.plot(attacks_by_year['Successful Attacks'], label="Successful Attacks",linewidth=3.0)
# plt.plot(attacks_by_year['Number of Perpetrators'], label="Number of Perps")
plt.plot(attacks_by_year['Number Killed'], label="Number Killed",linewidth=3.0)
plt.plot(attacks_by_year['Number Wounded'], label="Number Wounded",linewidth=3.0)
# plt.plot(attacks_by_year['Total Property Value'], label="Total Property Value")
plt.xlabel('Year',fontsize=30)
plt.ylabel('Count',fontsize=30)
plt.title('Total Terror Attacks',fontsize=30)
plt.legend(prop={'size':20})
plt.show()
Up until now, most of the line graphs have looked pretty similar, but check out the green Suicide Attacks line: it looks nearly constant, with a slight upward trend in the most recent years. I want to investigate this further. Now I don't regret writing all of that code in the last block.
I'm going to see if there's a statistically significant difference between suicide attacks pre-9/11 and post-9/11.
I will do this by comparing the number of suicide attacks per year for the decade before 9/11 (1990-2000) and the decade after (2002-2011).
# compare distributions of suicide attacks per year before and after 9/11
base = df[df.suicide==1]  ## suicide attacks only
plt.hist(base.iyear[(base.iyear>=1990)&(base.iyear<2001)],bins=11)
plt.title('Suicide Attacks by Year--Before 9/11',fontsize=30)
plt.show()
plt.hist(base.iyear[(base.iyear>2001)&(base.iyear<=2011)],bins=10)
plt.title('Suicide Attacks by Year--After 9/11',fontsize=30)
plt.show()
print 'Avg # suicide attacks before 9/11 =', round(base.iyear[(base.iyear>=1990)&(base.iyear<2001)].value_counts().mean(),2),'; total =', base.iyear[(base.iyear>=1990)&(base.iyear<2001)].value_counts().sum()
print 'Avg # suicide attacks after 9/11 =', round(base.iyear[(base.iyear>2001)&(base.iyear<=2011)].value_counts().mean(),2), '; total =', base.iyear[(base.iyear>2001)&(base.iyear<=2011)].value_counts().sum()
So they're both pretty high averages, but I can't say with confidence whether or not the difference is statistically significant. Let's try to figure that out.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
QUESTION: By comparing terror attacks before and after 9/11, can we say that the 9/11 attack resulted in a statistically significant difference in the number of suicide attacks?
As this was one of the deadliest attacks in heavily populated areas of the US, and was broadly publicized, I would assume that the number of suicide attacks would, sadly, increase due to the popularization of radical extremist groups.
# df with only the suicide attacks
base = df[df.suicide==1]
# per-year counts for the prior (pre-1990)
df_prior = base[base.iyear<1990].iyear.value_counts()
# per-year counts for both of the time periods I'm comparing
df_before = base[(base.iyear<2001)&(base.iyear>=1990)].iyear.value_counts()
df_after = base[(base.iyear>2001)&(base.iyear<=2011)].iyear.value_counts()
# set up bayesian model and priors on means
mean_prior_mean = df_prior.mean()
mean_prior_std = df_prior.std()
print 'mean =', round(mean_prior_mean,2)
print 'std =', round(mean_prior_std,2)
std_prior_lower = 0.01
std_prior_upper = 100.0
# set the prior distribution to both time periods before and after 9/11
# prior belief that the number of suicide attacks is the same
with pm.Model() as model:
    before_mean = pm.Normal('before_mean', mean_prior_mean, sd=mean_prior_std)
    after_mean = pm.Normal('after_mean', mean_prior_mean, sd=mean_prior_std)
    before_std = pm.Uniform('before_std', lower=std_prior_lower, upper=std_prior_upper)
    after_std = pm.Uniform('after_std', lower=std_prior_lower, upper=std_prior_upper)
# using the prior distribution, we will set up the posterior distribution
# so using before/after_means, stds, and then adding the observed suicide attacks
# in both the time periods before and after 9/11
with model:
    posterior_before = pm.Normal('Before_9_11', mu=before_mean, sd=before_std, observed=df_before)
    posterior_after = pm.Normal('After_9_11', mu=after_mean, sd=after_std, observed=df_after)
with model:
    diff_of_means = pm.Deterministic('difference of means', after_mean - before_mean)
    diff_of_stds = pm.Deterministic('difference of stds', after_std - before_std)
    effect_size = pm.Deterministic('effect size',
                                   diff_of_means / np.sqrt((after_std**2 + before_std**2) / 2))
with model:
    trace = pm.sample(25000, njobs=4)
pm.traceplot(trace);
# these colors are horrendous
pm.plot_posterior(trace[3000:],
varnames=['after_mean', 'before_mean', 'before_std', 'after_std'],
color='#87ceeb');
pm.plot_posterior(trace[3000:],
varnames=['difference of means', 'difference of stds', 'effect size'],
ref_val=0,
color='#87ceeb');
pm.summary(trace[3000:],
varnames=['difference of means', 'difference of stds', 'effect size'])
It appears as though the difference in suicide attacks/year before and after 9/11 is not statistically significant. I guess it's a positive thing that 9/11 didn't lead to a huge increase in suicide attacks, but it says nothing about other types of attacks.
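As a sanity check on the Bayesian conclusion, here's a quick frequentist cross-check I'm sketching in: Welch's two-sample t-test on the same per-year counts. This assumes scipy is available and reuses the df_before/df_after series from above.
from scipy import stats
# Welch's t-test (unequal variances) on the yearly suicide-attack counts
t_stat, p_val = stats.ttest_ind(df_before.values, df_after.values, equal_var=False)
print 't =', round(t_stat, 3), '; p =', round(p_val, 4)
# a large p-value here agrees with the Bayesian read: no significant difference in means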
The codebook for the dataset describes how poor handling of the data over time led to the loss of some of the 1993 records. That year is therefore excluded from the dataset so as not to give a false representation of the actual distribution.
The instruction was to impute the number of 1993 bombings, but I'll also do it for the other attack types.
# Compile list of new dates, setting new date to the first day of the year
dates = []
for i in range(0,len(df)):
    dates.append(str(df.iyear.loc[i])+'-01-01')
# create a clean date for the actual/real date
dates_real = []
for i in range(0,len(df)):
    month = df.imonth.loc[i]
    day = df.iday.loc[i]
    if month==0:
        month = 1
    if day==0:
        day = 1
    dates_real.append(str(df.iyear.loc[i])+'-'+str(month)+'-'+str(day))
# Add Date series to df
df['Date'] = pd.Series(dates)
df.Date = pd.to_datetime(df.Date)
df['Date_real'] = pd.Series(dates_real)
df.Date_real = pd.to_datetime(df.Date_real)
impute = df[['Date','attacktype1','attacktype1_txt']][(df.attacktype1==3)&(df.iyear<=1992)].groupby(['Date','attacktype1']).attacktype1_txt.count().reset_index()
impute.rename(columns={'attacktype1_txt':'num_attacks'},inplace=True)
impute.drop(['attacktype1'],axis=1,inplace=True)
#converting 'date' column to a datetime type
impute['Date'] = pd.to_datetime(impute['Date'])
# resetting date back to the index
impute.set_index('Date',inplace = True)
import matplotlib.pyplot as plt
temp = pd.DataFrame(df[['Date_real','eventid']].groupby('Date_real').eventid.count())
temp.index = pd.to_datetime(temp.index)
plt.figure(figsize=(10,8))
plt.plot(temp)
plt.show()
I'm going to make a judgment call that the data is close enough to stationary that it doesn't need de-trending. After the mean peaks around 2013-2014, it begins to fall a bit. Also, due to the sparsity of data in the earlier years, the counts are lower than they might have been had all terror attacks been recorded; conversely, it's entirely possible that before the rise of social media, fewer attacks were actually known. These two effects could balance each other out, making the series look less like it's trending.
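To put a number behind that judgment call, here's a quick stationarity check I'm adding: an augmented Dickey-Fuller test on the daily counts, a sketch assuming statsmodels' adfuller is available.
from statsmodels.tsa.stattools import adfuller
# ADF null hypothesis: the series has a unit root (i.e., is non-stationary)
adf_stat, p_value = adfuller(temp.eventid)[:2]
print 'ADF statistic =', round(adf_stat, 3), '; p =', round(p_value, 4)
# p < 0.05 would support treating the series as stationary without de-trending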
from pandas.tools.plotting import autocorrelation_plot  # lives at pandas.plotting in newer pandas versions
autocorrelation_plot(temp)
plt.show()
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Look back 22 lags (years -- the series is annual) to analyze autocorr and partial autocorr
plot_pacf(impute, lags = 22)
plt.show()
plot_acf(impute, lags = 22)
plt.show()
My data is grouped by year, so lags are in years. Aside from lag 0 in the partial autocorrelation plot, lag 1 has the highest partial autocorrelation and is the only lag outside the 95% confidence interval, so we set p=1. The ACF plot shows no sharp cutoff, so we'll try q=0.
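To double-check what I'm reading off the plots, this small sketch (assuming statsmodels' acf/pacf helpers) prints the first few values directly; lag 1 should dominate the PACF if an AR(1) term is appropriate.
from statsmodels.tsa.stattools import acf, pacf
print 'PACF:', [round(v, 3) for v in pacf(impute.num_attacks, nlags=5)]
print 'ACF: ', [round(v, 3) for v in acf(impute.num_attacks, nlags=5)]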
from __future__ import division
from statsmodels.tsa.arima_model import ARMA
# Find optimal p, q:
final_AIC = float('inf')
p,q = 0,0
for i in range(0,5):
    for j in range(0,5):
        try:
            AIC = ARMA(impute,order=(i,j)).fit().aic
            print i,j,AIC
            if AIC < final_AIC:
                final_AIC = AIC
                p, q = i,j
        except:
            pass  # some (p,q) combinations fail to converge; skip them
final_AIC, p, q
# Really glad I saved the results from the first time this ran -- on a re-run every fit seems to fail and get swallowed by the except, so nothing prints...
# 0 0 363.8556849
# 0 1 342.951384017
# 0 3 328.177608525
# 1 0 315.857908067
# 1 1 317.517697807
# 1 2 318.145000956
# 1 3 nan
# 1 4 331.215672293
# 2 0 317.307563434
# 2 1 316.698746446
# 2 3 nan
# 2 4 nan
# 3 0 317.73817456
# 3 2 334.171843641
# 3 3 nan
# 3 4 nan
# 4 0 319.320923525
# 4 2 nan
# 4 4 nan
# (315.85790806726385, 1, 0)
# Anyways this right here ^^ is the result
This test has confirmed my choice for p=1, q=0.
from matplotlib import pyplot as plt
# fit model
impute.num_attacks = impute.num_attacks.astype(float)
model = ARMA(impute, order=(1,0))
model_fit = model.fit()
print(model_fit.summary())
# plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
plt.plot(impute,color='blue',label='true')
plt.plot(model_fit.fittedvalues, color='red',label='predicted')
plt.legend()
plt.show()
print(residuals.describe())
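Eyeballing residual plots only goes so far, so here's a Ljung-Box white-noise check I'm sketching in (assuming statsmodels' acorr_ljungbox; its return format varies across statsmodels versions).
from statsmodels.stats.diagnostic import acorr_ljungbox
# null hypothesis: the residuals are independently distributed (white noise);
# large p-values mean the AR(1) fit has captured the autocorrelation structure
print acorr_ljungbox(model_fit.resid, lags=10)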
from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
# get what you need for predicting one-step ahead
params = model_fit.params
residuals = model_fit.resid
p = model_fit.k_ar
q = model_fit.k_ma
k_exog = model_fit.k_exog
k_trend = model_fit.k_trend
steps = 1
_arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))
Here's a function to predict the number of attacks for the other attack types.
from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
from statsmodels.tsa.arima_model import ARMA
from matplotlib import pyplot as plt
def impute_attacks(attack_type,step=1):
    impute = df[['Date','attacktype1']][(df.attacktype1==attack_type)&(df.iyear<=1992)].groupby('Date').attacktype1.count().reset_index()
    impute.rename(columns={'attacktype1':'num_attacks'},inplace=True)
    # converting 'Date' column to a datetime type
    impute['Date'] = pd.to_datetime(impute['Date'])
    # setting date back as the index
    impute.set_index('Date',inplace = True)
    impute.num_attacks = impute.num_attacks.astype(float)
    model = ARMA(impute, order=(1,0))
    model_fit = model.fit()
    params = model_fit.params
    residuals = model_fit.resid
    p = model_fit.k_ar
    q = model_fit.k_ma
    k_exog = model_fit.k_exog
    k_trend = model_fit.k_trend
    steps = step
    attack_txt = df.attacktype1_txt[df.attacktype1==attack_type].unique()[0]
    return attack_txt, _arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=impute, exog=None, start=len(impute))[0]
# loop through each attack type and predict the number of attacks
predicted = []
for attack_type in df.attacktype1.unique():
    predicted.append(impute_attacks(attack_type))
# turn into a dataframe and print the total predicted number of 1993 terror attacks
predicted = pd.DataFrame(predicted,columns=['Attack Type','1993 Predicted Number of Attacks'])
print 'Total # of Terror Attacks in 1993: ', predicted['1993 Predicted Number of Attacks'].sum(); predicted
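If I wanted to fold these estimates back into the data, one option would be to round each prediction and append it as a 1993 row. A sketch, under the assumption that downstream work uses a per-year/per-type count table; the counts and rows_1993 names are mine, not from the dataset.
# build a per-year, per-attack-type count table, then splice in the 1993 estimates
counts = df.groupby(['iyear','attacktype1_txt']).eventid.count().reset_index()
counts.rename(columns={'eventid':'num_attacks'},inplace=True)
rows_1993 = pd.DataFrame({'iyear': 1993,
                          'attacktype1_txt': predicted['Attack Type'],
                          'num_attacks': predicted['1993 Predicted Number of Attacks'].round().astype(int)})
counts = pd.concat([counts, rows_1993], ignore_index=True).sort_values(['iyear','attacktype1_txt'])
counts[counts.iyear==1993]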