Loading¶
In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.basemap import Basemap
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import xgboost as xgb
from sklearn.metrics import roc_curve,roc_auc_score
# Load the pre-filtered collision extract used throughout this notebook.
# The alternative input below is kept, disabled, for reference only.
# load_df = pd.read_csv("Killed_and_Seriously_Injured.csv")
load_df = pd.read_csv(filepath_or_buffer="allfilter_injury_data2.csv")
# Preview the first five rows.
load_df.head(5)
Out[29]:
X | Y | OBJECTID | INDEX_ | ACCNUM | DATE | TIME | STREET1 | STREET2 | OFFSET | ... | SPEEDING | AG_DRIV | REDLIGHT | ALCOHOL | DISABILITY | HOOD_158 | NEIGHBOURHOOD_158 | HOOD_140 | NEIGHBOURHOOD_140 | DIVISION | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 642702.4974 | 4.855938e+06 | 20 | 3363207 | 882024.0 | 2006/01/07 10:00:00+00 | 2325 | STEELES AVE E | NINTH LINE ST | NaN | ... | NaN | NaN | NaN | NaN | NaN | 144 | Morningside Heights | 131 | Rouge (131) | D42 |
1 | 616144.1868 | 4.841944e+06 | 32 | 3363869 | 882497.0 | 2006/01/08 10:00:00+00 | 1828 | ISLINGTON AVE | GOLFDOWN DR | NaN | ... | NaN | Yes | NaN | NaN | NaN | 5 | Elms-Old Rexdale | 5 | Elms-Old Rexdale (5) | D23 |
2 | 638249.2383 | 4.847699e+06 | 35 | 3363416 | 882174.0 | 2006/01/09 10:00:00+00 | 1435 | KENNEDY RD | GLAMORGAN AVE | NaN | ... | NaN | NaN | NaN | NaN | NaN | 126 | Dorset Park | 126 | Dorset Park (126) | D41 |
3 | 636288.2909 | 4.842392e+06 | 43 | 3363879 | 882501.0 | 2006/01/11 10:00:00+00 | 1120 | BARTLEY DR | JINNAH CRT | NaN | ... | Yes | Yes | NaN | NaN | NaN | 43 | Victoria Village | 43 | Victoria Village (43) | D55 |
4 | 638765.5901 | 4.848810e+06 | 63 | 3371161 | 886230.0 | 2006/01/21 10:00:00+00 | 1829 | MIDLAND AVE | GOODLAND GT | NaN | ... | Yes | Yes | NaN | NaN | NaN | 128 | Agincourt South-Malvern West | 128 | Agincourt South-Malvern West (128) | D42 |
5 rows × 54 columns
ETL: persons to incidents¶
In [30]:
# Disabled one-off ETL, kept for reference. When enabled it:
#   * keeps rows whose ACCLASS and INJURY are both 'Fatal' (the per-ACCNUM
#     de-duplication of these rows is itself commented out),
#   * keeps 'Non-Fatal Injury' rows, de-duplicated on ACCNUM,
#   * concatenates both sets and writes 'allfilter_injury_data2.csv'.
# NOTE(review): presumably this was run against the raw export to produce the
# CSV that the loading cell reads — confirm before re-enabling.
# fatal_rows = (load_df['ACCLASS'] == 'Fatal') & (load_df['INJURY'] == 'Fatal')
# df_fatal = load_df.loc[fatal_rows]
# # df_fatal = df_fatal.drop_duplicates(subset=['ACCNUM'])
# no_fatal_row = (load_df['ACCLASS'] == 'Non-Fatal Injury')
# df_non_fatal = load_df.loc[no_fatal_row]
# df_non_fatal = df_non_fatal.drop_duplicates(subset=['ACCNUM'])
# df_final = pd.concat([df_fatal, df_non_fatal], ignore_index=True)
# df_final.to_csv('allfilter_injury_data2.csv', index=False)
EDA: exploring data initially¶
In [31]:
# Print each column's name, non-null count and dtype to spot sparsely populated fields.
load_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5299 entries, 0 to 5298 Data columns (total 54 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X 5299 non-null float64 1 Y 5299 non-null float64 2 OBJECTID 5299 non-null int64 3 INDEX_ 5299 non-null int64 4 ACCNUM 4962 non-null float64 5 DATE 5299 non-null object 6 TIME 5299 non-null int64 7 STREET1 5299 non-null object 8 STREET2 4804 non-null object 9 OFFSET 732 non-null object 10 ROAD_CLASS 5155 non-null object 11 DISTRICT 5208 non-null object 12 LATITUDE 5299 non-null float64 13 LONGITUDE 5299 non-null float64 14 ACCLOC 3452 non-null object 15 TRAFFCTL 5269 non-null object 16 VISIBILITY 5287 non-null object 17 LIGHT 5297 non-null object 18 RDSFCOND 5286 non-null object 19 ACCLASS 5299 non-null object 20 IMPACTYPE 5290 non-null object 21 INVTYPE 5296 non-null object 22 INVAGE 5299 non-null object 23 INJURY 2487 non-null object 24 FATAL_NO 864 non-null float64 25 INITDIR 4000 non-null object 26 VEHTYPE 4816 non-null object 27 MANOEUVER 3448 non-null object 28 DRIVACT 3194 non-null object 29 DRIVCOND 3192 non-null object 30 PEDTYPE 672 non-null object 31 PEDACT 673 non-null object 32 PEDCOND 667 non-null object 33 CYCLISTYPE 109 non-null object 34 CYCACT 114 non-null object 35 CYCCOND 113 non-null object 36 PEDESTRIAN 2402 non-null object 37 CYCLIST 623 non-null object 38 AUTOMOBILE 4691 non-null object 39 MOTORCYCLE 530 non-null object 40 TRUCK 296 non-null object 41 TRSN_CITY_VEH 282 non-null object 42 EMERG_VEH 7 non-null object 43 PASSENGER 1223 non-null object 44 SPEEDING 660 non-null object 45 AG_DRIV 2547 non-null object 46 REDLIGHT 350 non-null object 47 ALCOHOL 208 non-null object 48 DISABILITY 145 non-null object 49 HOOD_158 5299 non-null object 50 NEIGHBOURHOOD_158 5299 non-null object 51 HOOD_140 5299 non-null object 52 NEIGHBOURHOOD_140 5299 non-null object 53 DIVISION 5299 non-null object dtypes: float64(6), int64(3), object(45) memory usage: 2.2+ MB
In [32]:
# Print a heading followed by the number of missing entries in every column.
print("\nMissing values:", load_df.isnull().sum(), sep="\n")
Missing values: X 0 Y 0 OBJECTID 0 INDEX_ 0 ACCNUM 337 DATE 0 TIME 0 STREET1 0 STREET2 495 OFFSET 4567 ROAD_CLASS 144 DISTRICT 91 LATITUDE 0 LONGITUDE 0 ACCLOC 1847 TRAFFCTL 30 VISIBILITY 12 LIGHT 2 RDSFCOND 13 ACCLASS 0 IMPACTYPE 9 INVTYPE 3 INVAGE 0 INJURY 2812 FATAL_NO 4435 INITDIR 1299 VEHTYPE 483 MANOEUVER 1851 DRIVACT 2105 DRIVCOND 2107 PEDTYPE 4627 PEDACT 4626 PEDCOND 4632 CYCLISTYPE 5190 CYCACT 5185 CYCCOND 5186 PEDESTRIAN 2897 CYCLIST 4676 AUTOMOBILE 608 MOTORCYCLE 4769 TRUCK 5003 TRSN_CITY_VEH 5017 EMERG_VEH 5292 PASSENGER 4076 SPEEDING 4639 AG_DRIV 2752 REDLIGHT 4949 ALCOHOL 5091 DISABILITY 5154 HOOD_158 0 NEIGHBOURHOOD_158 0 HOOD_140 0 NEIGHBOURHOOD_140 0 DIVISION 0 dtype: int64
Transform columns:¶
'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'VEHTYPE', 'ALCOHOL'
In [33]:
# Collapse the raw VEHTYPE labels into coarse vehicle groups:
#    1  Small Vehicles
#    2  Trucks and Vans
#    3  Public Transit
#    4  Emergency and Unknown
#    5  Special Equipment
#    6  Off-Road
#    7  Bicycles and Mopeds
#    8  Motorcycles
#    9  Rickshaws
#   10  Others
# Rows without a VEHTYPE are labelled 'Other' first, so they end up in group 10.
# Any label absent from the table below becomes NaN after .map().
vehicle_groups = {
    1: ['Automobile, Station Wagon', 'Pick Up Truck', 'Passenger Van', 'Taxi'],
    2: ['Delivery Van', 'Truck - Open', 'Truck - Closed (Blazer, etc)', 'Truck - Dump',
        'Truck-Tractor', 'Truck (other)', 'Truck - Tank', 'Tow Truck',
        'Truck - Car Carrier'],
    3: ['Municipal Transit Bus (TTC)', 'Street Car', 'Bus (Other) (Go Bus, Gray Coa',
        'Intercity Bus', 'School Bus'],
    4: ['Unknown', 'Police Vehicle', 'Fire Vehicle', 'Other Emergency Vehicle',
        'Ambulance'],
    5: ['Construction Equipment'],
    6: ['Off Road - 2 Wheels', 'Off Road - 4 Wheels', 'Off Road - Other'],
    7: ['Bicycle', 'Moped'],
    8: ['Motorcycle'],
    9: ['Rickshaw'],
    10: ['Other'],
}
# Invert group -> labels into the label -> group lookup used by .map().
classification = {
    label: group for group, labels in vehicle_groups.items() for label in labels
}
load_df['VEHTYPE'] = load_df['VEHTYPE'].fillna('Other').map(classification)
load_df['VEHTYPE'].value_counts()
Out[33]:
VEHTYPE 1 2741 10 1923 8 351 7 127 2 96 3 54 4 5 6 1 9 1 Name: count, dtype: int64
In [34]:
# Reduce DRIVCOND to three driver-condition codes:
#   1  Normal
#   2  Impaired (inattentive, medical or physical disability, had been drinking,
#      alcohol impairment, drug impairment)
#   3  Other (other and fatigue)
# Missing DRIVCOND values are filled with 'Other' and therefore receive code 3.
# NOTE(review): 'Unknown' is coded 2 (Impaired) although the legend does not list
# it in that group — confirm this is intended.
impaired_conditions = [
    'Inattentive',
    'Unknown',
    'Medical or Physical Disability',
    'Had Been Drinking',
    'Ability Impaired, Alcohol Over .08',
    'Ability Impaired, Alcohol',
    'Ability Impaired, Drugs',
]
drivcond_classification = {'Normal': 1, 'Other': 3, 'Fatigue': 3}
drivcond_classification.update(dict.fromkeys(impaired_conditions, 2))
load_df['DRIVCOND'] = load_df['DRIVCOND'].fillna('Other').map(drivcond_classification)
load_df['DRIVCOND'].value_counts()
Out[34]:
DRIVCOND 3 2193 1 1757 2 1349 Name: count, dtype: int64
In [35]:
# Bucket the INVAGE five-year bands into six age groups:
#   1  Infants and Young Children (0 to 9)
#   2  Adolescents (10 to 19)
#   3  Young Adults (20 to 34)
#   4  Middle-Aged Adults (35 to 49)
#   5  Older Adults (50 and above)
#   6  Unknown
# Any INVAGE label not listed below becomes NaN after .map().
age_classification = {
    'unknown': 6,
    **dict.fromkeys(['0 to 4', '5 to 9'], 1),
    **dict.fromkeys(['10 to 14', '15 to 19'], 2),
    **dict.fromkeys(['20 to 24', '25 to 29', '30 to 34'], 3),
    **dict.fromkeys(['35 to 39', '40 to 44', '45 to 49'], 4),
    **dict.fromkeys(
        ['50 to 54', '55 to 59', '60 to 64', '65 to 69', '70 to 74',
         '75 to 79', '80 to 84', '85 to 89', '90 to 94', 'Over 95'],
        5,
    ),
}
# Replace the band labels with their group codes and show the resulting distribution.
load_df['INVAGE'] = load_df['INVAGE'].map(age_classification)
load_df['INVAGE'].value_counts()
Out[35]:
INVAGE 5 1799 3 1214 4 1048 6 994 2 200 1 44 Name: count, dtype: int64
In [36]:
# Encode TRAFFCTL as three codes:
#   1  No Control
#   2  Traffic Control Devices (signal, stop/yield sign, pedestrian crossover,
#      traffic controller, streetcar stop)
#   3  Other (traffic gate, school guard, police control)
# Missing TRAFFCTL values are treated as 'No Control' (code 1).
traffic_control_classification = {
    'No Control': 1,
    **dict.fromkeys(
        ['Traffic Signal', 'Stop Sign', 'Pedestrian Crossover',
         'Traffic Controller', 'Yield Sign', 'Streetcar (Stop for)'],
        2,
    ),
    **dict.fromkeys(['Traffic Gate', 'School Guard', 'Police Control'], 3),
}
load_df['TRAFFCTL'] = (
    load_df['TRAFFCTL'].fillna('No Control').map(traffic_control_classification)
)
load_df['TRAFFCTL'].value_counts()
Out[36]:
TRAFFCTL 2 2668 1 2627 3 4 Name: count, dtype: int64
In [37]:
# Encode VISIBILITY as three codes:
#   1  Clear
#   2  Adverse Weather (rain, snow, fog/mist/smoke/dust, freezing rain,
#      drifting snow, other)
#   3  Severe Weather (strong wind)
# Missing VISIBILITY values are treated as 'Clear' (code 1).
adverse_weather = [
    'Rain', 'Snow', 'Other', 'Fog, Mist, Smoke, Dust', 'Freezing Rain', 'Drifting Snow',
]
visibility_classification = {'Clear': 1, 'Strong wind': 3}
visibility_classification.update(dict.fromkeys(adverse_weather, 2))
# Swap the labels for their codes and show the resulting distribution.
load_df['VISIBILITY'] = load_df['VISIBILITY'].fillna('Clear').map(visibility_classification)
load_df['VISIBILITY'].value_counts()
Out[37]:
VISIBILITY 1 4530 2 766 3 3 Name: count, dtype: int64
In [38]:
# Encode LIGHT as three codes:
#   1  Daylight ('Daylight', 'Daylight, artificial')
#   2  Artificial Light ('Dark, artificial', 'Dusk, artificial', 'Dawn, artificial')
#   3  Low Light ('Dark', 'Dusk', 'Dawn', 'Other')
# Missing LIGHT values are filled with 'Other' and therefore receive code 3.
light_classification = {
    **dict.fromkeys(['Daylight', 'Daylight, artificial'], 1),
    **dict.fromkeys(['Dark, artificial', 'Dusk, artificial', 'Dawn, artificial'], 2),
    **dict.fromkeys(['Dark', 'Dusk', 'Dawn', 'Other'], 3),
}
# Swap the labels for their codes and show the resulting distribution.
load_df['LIGHT'] = load_df['LIGHT'].fillna('Other').map(light_classification)
load_df['LIGHT'].value_counts()
Out[38]:
LIGHT 1 3071 3 1318 2 910 Name: count, dtype: int64
In [39]:
# Encode RDSFCOND (road surface condition) as five codes:
#   1  Dry
#   2  Wet            - 'Wet' and 'Spilled liquid'
#   3  Slushy/Other   - 'Slush' and 'Other'
#   4  Loose Surface  - 'Loose Snow', 'Packed Snow', 'Loose Sand or Gravel'
#   5  Ice
# Missing RDSFCOND values are filled with 'Other' and therefore receive code 3.
surface_groups = {
    1: ['Dry'],
    2: ['Wet', 'Spilled liquid'],
    3: ['Slush', 'Other'],
    4: ['Loose Snow', 'Packed Snow', 'Loose Sand or Gravel'],
    5: ['Ice'],
}
# Invert group -> labels into the label -> group lookup used by .map().
road_condition_classification = {
    label: group for group, labels in surface_groups.items() for label in labels
}
load_df['RDSFCOND'] = load_df['RDSFCOND'].fillna('Other').map(road_condition_classification)
load_df['RDSFCOND'].value_counts()
Out[39]:
RDSFCOND 1 4201 2 921 3 95 4 63 5 19 Name: count, dtype: int64
In [40]:
# Encode INVTYPE (role of the involved person) as six codes:
#   1  Drivers                   - car, motorcycle, truck and moped drivers, 'Driver - Not Hit'
#   2  Cyclists/Skaters          - cyclists, cyclist passengers, in-line skaters
#   3  Passengers                - car, motorcycle and moped passengers
#   4  Pedestrians               - pedestrians and wheelchair users
#   5  Vehicle & Property Owners - vehicle owners and other property owners
#   6  Other/Special Cases       - witnesses, trailer owners, other unspecified cases
# Missing INVTYPE values are filled with 'Other' and therefore receive code 6.
#
# Every role named in the legend has an entry, so rows with those roles are encoded
# rather than turned into NaN by .map() and then removed by the dropna() below.
# NOTE(review): the labels 'Cyclist Passenger', 'Moped Passenger', 'Witness' and
# 'Trailer Owner' are spelled after the legend — verify them against the raw data.
# Any label that is still not listed here is dropped from load_df.
invtype_classification = {
    # 1: drivers
    'Driver': 1,
    'Motorcycle Driver': 1,
    'Truck Driver': 1,
    'Moped Driver': 1,
    'Driver - Not Hit': 1,
    # 2: cyclists / skaters
    'Cyclist': 2,
    'Cyclist Passenger': 2,
    'In-Line Skater': 2,
    # 3: passengers
    'Passenger': 3,
    'Motorcycle Passenger': 3,
    'Moped Passenger': 3,
    # 4: pedestrians
    'Pedestrian': 4,
    'Wheelchair': 4,
    # 5: vehicle & property owners
    'Vehicle Owner': 5,
    'Other Property Owner': 5,
    # 6: other / special cases
    'Other': 6,
    'Witness': 6,
    'Trailer Owner': 6,
}
load_df["INVTYPE"] = load_df["INVTYPE"].fillna('Other')
# Apply classification to the DataFrame
load_df['INVTYPE'] = load_df['INVTYPE'].map(invtype_classification)
# Remove rows whose role is not covered by the table above.
load_df = load_df.dropna(subset=['INVTYPE'])
load_df['INVTYPE'].value_counts()
Out[40]:
INVTYPE 1 3253 5 759 4 676 3 463 2 118 6 30 Name: count, dtype: int64
In [41]:
# Encode IMPACTYPE as three codes:
#   1  Collisions involving vulnerable road users
#      ('Pedestrian Collisions', 'Cyclist Collisions')
#   2  Vehicle collisions ('Turning Movement', 'Rear End', 'Angle', 'Sideswipe',
#      'Approaching', and 'SMV Other')
#   3  Other ('Other', 'SMV Unattended Vehicle')
# Missing IMPACTYPE values are filled with 'Other' and therefore receive code 3.
# NOTE(review): the cell's original legend listed 'SMV Other' under group 3, but the
# mapping assigns it 2; the mapping is what takes effect — confirm which was intended.
impact_groups = {
    1: ['Pedestrian Collisions', 'Cyclist Collisions'],
    2: ['Turning Movement', 'Rear End', 'SMV Other', 'Angle', 'Approaching', 'Sideswipe'],
    3: ['Other', 'SMV Unattended Vehicle'],
}
# Invert group -> labels into the label -> group lookup used by .map().
impact_type_classification = {
    label: group for group, labels in impact_groups.items() for label in labels
}
load_df['IMPACTYPE'] = load_df['IMPACTYPE'].fillna('Other').map(impact_type_classification)
load_df['IMPACTYPE'].value_counts()
Out[41]:
IMPACTYPE 1 2970 2 2210 3 119 Name: count, dtype: int64
In [42]:
# Binary target: 1 for 'Fatal'; 0 for 'Non-Fatal Injury' and 'Property Damage O'.
# A missing ACCLASS is treated as 'Non-Fatal Injury' before encoding.
acclass_codes = {"Fatal": 1, "Non-Fatal Injury": 0, "Property Damage O": 0}
load_df["ACCLASS"] = load_df["ACCLASS"].fillna('Non-Fatal Injury').map(acclass_codes)
# Show the class balance of the encoded target.
load_df["ACCLASS"].value_counts()
Out[42]:
ACCLASS 0 4325 1 974 Name: count, dtype: int64
map¶
In [43]:
def mapToronto(data_full, grid_step=0.1):
    """Scatter collision locations on a Mercator map of Toronto, coloured by ACCLASS.

    Draws onto the current matplotlib figure and then shows it.

    Parameters
    ----------
    data_full : DataFrame-like
        Must provide 'LATITUDE', 'LONGITUDE' and 'ACCLASS' columns, each exposing
        ``.values``. Coordinates are passed to Basemap with ``latlon=True``, i.e.
        as longitude/latitude in degrees. ACCLASS supplies the colour value.
    grid_step : float, default 0.1
        Spacing, in degrees, of the labelled parallels and meridians.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``grid_step`` is not positive.
    """
    if grid_step <= 0:
        raise ValueError("grid_step must be a positive number of degrees")

    # Bounding box of the plotted area (Toronto, Canada).
    llcrnrlat = 43.581024   # lower-left corner latitude
    urcrnrlat = 43.855457   # upper-right corner latitude
    llcrnrlon = -79.639219  # lower-left corner longitude
    urcrnrlon = -79.115218  # upper-right corner longitude

    # Initialize the Basemap
    m = Basemap(projection='merc', llcrnrlat=llcrnrlat, urcrnrlat=urcrnrlat,
                llcrnrlon=llcrnrlon, urcrnrlon=urcrnrlon, resolution='i')

    # Draw map details
    m.drawcountries()
    # The box spans well under one degree in each direction, so a whole-degree
    # graticule would never cross it. Generate grid lines at `grid_step`, starting
    # from the grid multiple just below the lower-left corner.
    parallels = np.arange(np.floor(llcrnrlat / grid_step) * grid_step,
                          urcrnrlat + grid_step, grid_step)
    meridians = np.arange(np.floor(llcrnrlon / grid_step) * grid_step,
                          urcrnrlon + grid_step, grid_step)
    m.drawparallels(parallels, labels=[1, 0, 0, 0])
    m.drawmeridians(meridians, labels=[0, 0, 0, 1])

    # Extract data from dataframe
    lat = data_full['LATITUDE'].values
    lon = data_full['LONGITUDE'].values
    a_1 = data_full['ACCLASS'].values

    # Plot data; keep the returned collection so the colour bar is tied to it
    # explicitly rather than to whatever the current mappable happens to be.
    points = m.scatter(lon, lat, latlon=True, c=a_1, s=50, linewidth=1,
                       edgecolors='red', cmap='hot', alpha=1)

    # Add color bar. The colour encodes the ACCLASS value of each row (a class
    # code, not a count), so the label says so.
    cbar = m.colorbar(points)
    cbar.set_label('ACCLASS (1 = Fatal, 0 = Non-Fatal)')

    # Add title
    plt.title("Toronto, Canada Fatalities", fontsize=30)
    plt.show()
# Seaborn "white" style with enlarged fonts, then a 20x20-inch figure for the map.
sns.set(font_scale=1.5, style="white")
plt.figure(figsize=(20, 20))
# Draw the collision map from the encoded frame.
mapToronto(data_full=load_df)