## Analyzing traffic accidents

Data: Accident data in NYC
Techniques: Exploratory analysis

Analyzing publicly available data on traffic accidents in NYC from 2009-2016 (work in progress).

``````%matplotlib inline
import numpy as np
import pandas as pd
from collections import defaultdict
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.tools as tls
from ggplot import *
``````
``````def make_int(i):
"""
Make strings into integers
"""
if i == '':
return None
else:
return int(i)

def group_data(data, key_name):
"""
Building a dict of lists
"""
grouped_data = defaultdict(list)
for data_point in data:
key = data_point[key_name]
grouped_data[key].append(data_point)
return grouped_data

def sum_grouped_items(grouped_data, field_name):
summed_data = {}
for key, data_points in grouped_data.items():
total = 0
for data_point in data_points:
total += data_point[field_name]
summed_data[key] = total
return summed_data

def cleanup(dic):
"""
Make string years into integers and sort (by year)
"""
year, total = zip(*(sorted(dic.items())))
return list(map(int,year)), total

def describe_data(data):
print('Mean:', np.mean(data))
print('Std:', np.std(data))
print('Min:', np.min(data))
print('Max:', np.max(data))

def percent_of_total(specific,total):
"""Get percent of total"""
return (specific/total)*100
``````
``````with open('fatality_monthly.csv') as f:

for record in records:
record['Fatalities'] = make_int(record['Fatalities'])
record['PedFatalit'] = make_int(record['PedFatalit'])
record['BikeFatali'] = make_int(record['BikeFatali'])
record['MVOFatalit'] = make_int(record['MVOFatalit'])

# Group by year
accidents_by_year = group_data(records, 'YR')

# Deleting 2016 because we don't have complete records.
del accidents_by_year['2016']
``````

### Fatalities by year

``````total_by_year = sum_grouped_items(accidents_by_year, 'Fatalities')
total_bike_by_year = sum_grouped_items(accidents_by_year, 'BikeFatali')
total_ped_by_year = sum_grouped_items(accidents_by_year, 'PedFatalit')
total_mvo_by_year = sum_grouped_items(accidents_by_year, 'MVOFatalit')
``````

#### Total fatalities by year

``````total_fatalities = total_by_year.values()
describe_data(list(total_fatalities))
``````
``````Mean: 220.428571429
Std: 19.1971511492
Min: 191
Max: 255
``````

#### The mean number of fatalities from 2009 - 2015 was about 220.

``````total_by_year_list = cleanup(total_by_year)

plt.bar(range(len(total_by_year_list)),total_by_year_list,align='center')
plt.title('Total fatalities by year')
plt.ylabel('fatalities')
plt.xlabel('year')
plt.xticks(range(len(total_by_year_list)),total_by_year_list)
`````` ### Bike fatalities by year

``````total_bike_by_year_list = cleanup(total_bike_by_year)

plt.bar(range(len(total_bike_by_year_list)),total_bike_by_year_list,align='center')
plt.xticks(range(len(total_bike_by_year_list)),total_bike_by_year_list)
plt.title('Total bike fatalities by year')
plt.xlabel('year')
plt.ylabel('bike fatalities')
`````` ### Pedestrian fatalities by year

``````total_ped_by_year_list = cleanup(total_ped_by_year)

plt.bar(range(len(total_ped_by_year_list)),total_ped_by_year_list,align='center')
plt.xticks(range(len(total_ped_by_year_list)),total_ped_by_year_list)

plt.title('Total pedestrianfatalities by year')
plt.xlabel('year')
plt.ylabel('pedestrian fatalities')
`````` ### Motor vehicle fatalities by year

``````total_mvo_by_year_list = cleanup(total_mvo_by_year)
plt.bar(range(len(total_mvo_by_year_list)),total_mvo_by_year_list,align='center')
plt.xticks(range(len(total_mvo_by_year_list)),total_mvo_by_year_list)

plt.title('Total pedestrian fatalities by year')
plt.xlabel('year')
plt.ylabel('pedestrian fatalities')
`````` ### Fatalities by type as a percentage of total

``````total_year_s = pd.Series(total_by_year)
total_bike_year_s = pd.Series(total_bike_by_year)
total_ped_year_s = pd.Series(total_ped_by_year)
total_mvo_year_s = pd.Series(total_mvo_by_year)
``````
``````bike_percent = percent_of_total(total_bike_year_s, total_year_s)
ped_percent = percent_of_total(total_ped_year_s, total_year_s)
mvo_percent = percent_of_total(total_mvo_year_s, total_year_s)

print(bike)
print(ped)
print(mvo)
``````
``````2009    5.381166
2010    8.636364
2011    9.478673
2012    7.627119
2013    4.705882
2014    9.178744
2015    6.806283
dtype: float64
2009    28.699552
2010    29.545455
2011    29.383886
2012    33.050847
2013    29.803922
2014    33.333333
2015    35.602094
dtype: float64
2009    28.699552
2010    29.545455
2011    29.383886
2012    33.050847
2013    29.803922
2014    33.333333
2015    35.602094
dtype: float64
``````
``````# Check that the percentanges add to 100
bike_pct = pd.Series(bike_percent)
ped_pct = pd.Series(ped_percent)
mvo_pct = pd.Series(mvo_percent)

bike_pct + ped_pct + mvo_pct
``````
``````2009    100.0
2010    100.0
2011    100.0
2012    100.0
2013    100.0
2014    100.0
2015    100.0
dtype: float64
``````

### Fatalities by type in one plot

``````multiple_bars=plt.figure()
x=[2009,2010,2011,2012,2013,2014,2015]
ind=np.arange(len(x))
bike=bike_percent
ped=ped_percent
mvo=mvo_percent

ax.set_title('Percentage of fatalities by type')

ax=plt.subplot(111)
ax.bar(ind-.2, ped, width=.2, color='r', align='center', label='ped')
ax.bar(ind, mvo, width=.2, color='g', align='center', label='MV')
ax.bar(ind+.2, bike, width=.2, color='b', align='center', label='bike')

ax.set_xticks(ind)
ax.set_xticklabels(x)
ax.set_ylabel('Percentage of total')
ax.set_xlabel('Year')
ax.legend(loc='best')
`````` ### Mapping fatalities

``````fatalities = pd.read_csv('fatality_monthly.csv')

# Delete all 2016 entries because they're not complete
fatalities_subset = fatalities[fatalities.YR !=2016]

# Group by year
grouped_year = fatalities_subset.groupby('YR')
``````

### Using latitide/longitude data to map fatalities

``````df = pd.DataFrame({'nodeX':fatalities_subset.loc[:,'nodeX'],
'nodeY':fatalities_subset.loc[:,'nodeY'],
'YR':fatalities_subset.loc[:,'YR']})

# NYC map of fatalitites from 2009-2015
print(ggplot(aes(x='nodeX', y='nodeY', color='factor(YR)'), data=df) + geom_point())
`````` What’s interesting here is that we can look at this data to see if Vision Zero a project aimed to increase traffic safety, has made a difference. Now this is a bit hard to analyze without other data- for example, if we look at the total number of bike fatalities after 2013, there was actually a spike. But is this because the total number of bikers increased? We would need number of bike fatalities as a proportion of total bikers. The same goes for pedestrian and motor vehicle data.

Next on the agenda: trends over time/time series analysis…