import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
Features to predict on: #deaths, #cases, (measurement that curve is flattening), %death/case
1.) Feature selection (choose best feature that predicts high deaths/cases)
2.) Using timeseries, determine which counties have lowered their case/death count
states = pd.read_csv('covid19/4.18states.csv')
counties = pd.read_csv('covid19/abridged_couties.csv')
cases = pd.read_csv('covid19/time_series_covid19_confirmed_US.csv')
deaths = pd.read_csv('covid19/time_series_covid19_deaths_US.csv')
print('Num rows:', len(deaths))
print('Num cols:', len(deaths.columns))
s = deaths.isnull().sum(axis=0)
s.loc[s > 0]
print('Num rows:', len(cases))
print('Num cols:', len(cases.columns))
s = cases.isnull().sum(axis=0)
s.loc[s > 0]
for col in deaths.columns:
if col not in cases.columns:
print(col)
# Verify that the NaNs in FIPS correspond between deaths and cases
print(deaths[deaths['FIPS'].isnull()]['Admin2'])
print(cases[cases['FIPS'].isnull()]['Admin2'])
From the cells above, we see that the deaths
and cases
dataframes are very similar. They have all the same columns except that cases
doesn't have the Population
column, and the two dataframes share the same rows that have NaNs in the FIPS
and Admin2
columns.
Because there are only 4 out of 3255 rows missing value in FIPS
we decide to drop those, and ignore Admin2
as it is only the name of the county and we can fill them in later with their respective FIPS code if needed. We will also ignore filling in Population
until we decide it is needed later.
# Verify 4 rows were deleted
deaths = deaths[deaths['FIPS'].notnull()]
cases = cases[cases['FIPS'].notnull()]
print('Deaths new num rows:', len(deaths))
print('Cases new num rows:', len(cases))
There are a total of 87 columns in the counties dataset, but as we can see that some of the columns have a very significant amount of NaNs. Later when we try to model what are key indicators of cases and deaths of coronavirus, we will decide what cutoff to use to drop columns. An example would be if there are more than 324 NaNs (over 10% of the rows are missing data) we could drop the column.
# Counties Dataset
print('Num rows:', len(counties))
print('Num cols:', len(counties.columns))
print("Number of nans in each column:")
counties.isnull().sum(axis=0).sort_values(ascending=False)[:10]
# np.array(counties.isnull().sum(axis=0).tolist())
MINIMUM_DATA_POINTS = 2000
number_nans_in_col = np.array(counties.isnull().sum(axis=0).tolist())
high_nan_idx= np.where(number_nans_in_col > MINIMUM_DATA_POINTS)[0]
high_nan_cols = [counties.columns[i] for i in high_nan_idx]
counties = counties.drop(high_nan_cols, axis=1)
print('Num rows:', len(counties))
print('Num cols:', len(counties.columns))
Similarly, some of the rows in the dataset have very few data entries for each feature. In an extreme example, the last two rows for New York City and Kansas City have 0 features or data points. Rows that are missing values for more than 40 features are dropped from the table
print("Number of nans in each county:")
counties.set_index("CountyName").isnull().sum(axis=1).sort_values(ascending=False)[:10]
MINIMUM_FEATURES = 40
number_nans_in_row = np.array(counties.isnull().sum(axis=1).tolist())
high_nan_idx= np.where(number_nans_in_row > MINIMUM_FEATURES)[0]
counties = counties.drop(high_nan_idx, axis=0)
print('Num rows:', len(counties))
print('Num cols:', len(counties.columns))
The data seems to be ordered relatively geographically (states are grouped together). We will make the assumption that rows consecutive rows will have similar values, so we will fill in the NaNs using ffill
followed by bfill
to fill in all values. This approach solves the issue with determining what a mean or median is for categorical data points.
# Verify no for NaNs
counties = counties.ffill().bfill()
counties.isnull().sum(axis=0).sort_values(ascending=False)[:5]
sns.boxplot(counties['federal guidelines'],color='green',orient='h')
Some of the data columns have 0 variance: all of the data points are the same. As a result, these features have 0 predictive power and instead make the model more prone to overfitting. In this step, we remove such features from the dataset.
print("Removed Features: ")
for col in counties:
data = counties[col]
if data.dtype == 'float64':
if np.max(data) - np.min(data) < 1:
print("\t" + col)
counties = counties.drop(col, axis=1)
print('Num rows:', len(counties))
print('Num cols:', len(counties.columns))
Some features have values that are much larger than other features. For example, values for population will be much higher than values for population density. To prevent these larger values from dominating predictions, we normalize and demean each quantitative feature, leaving categorical features the same.
for col in counties:
data = counties[col]
if data.dtype == 'float64':
counties[col] = (data - np.mean(data)) / np.std(data)
plt.rcParams.update({'figure.max_open_warning': 0})
quan_counties = counties.copy()
print("Removed categorical features: ")
for col in quan_counties:
data = quan_counties[col]
if data.dtype != 'float64':
print("\t" + col)
quan_counties = quan_counties.drop(col, axis=1)
df = quan_counties
l = df.columns.values
number_of_columns=12
number_of_rows = len(l)-1/number_of_columns
plt.figure(figsize=(18,5*number_of_rows))
for i in range(0,len(l)):
plt.subplot(number_of_rows + 1,number_of_columns,i+1)
sns.set_style('whitegrid')
sns.boxplot(df[l[i]],color='green',orient='v')
plt.tight_layout()
plt.scatter(counties['PopMale15-192010'], counties['PopFmle15-192010'])
plt.xlabel('PopMale15-192010')
plt.ylabel('PopFmle15-192010')
plt.title("PopMale15-192010 vs PopFmle15-192010")
plt.figure(figsize=(24,16))
a = sns.heatmap(quan_counties.corr(),cmap='Blues',annot=False)
# a.set_axis_labels("Features", "Features")
# quan_counties.corr()
a
df.boxplot(["3-YrDiabetes2015-17"])
l = df.columns.values
number_of_columns=12
number_of_rows = len(l)-1/number_of_columns
plt.figure(figsize=(2*number_of_columns,5*number_of_rows))
for i in range(0,len(l)):
plt.subplot(number_of_rows + 1,number_of_columns,i+1)
sns.distplot(df[l[i]],kde=True)
counties[">50 gatherings"]
df.boxplot([">500 gatherings"])
counties["countyFIPS"] = counties["countyFIPS"].astype(int)
deaths['FIPS'] = deaths['FIPS'].astype(int)
counties_deaths = counties.set_index(['countyFIPS']).join(deaths.set_index("FIPS"), how='inner')
cases["totalCases"] = cases["4/9/20"] + cases["4/10/20"] + \
cases["4/11/20"] + cases["4/12/20"] + \
cases["4/13/20"] + cases["4/14/20"] + \
cases["4/15/20"] + cases["4/16/20"] + \
cases["4/17/20"] + cases["4/18/20"]
cases['FIPS'] = cases['FIPS'].astype(int)
counties_deaths = counties_deaths.join(cases[['FIPS', 'totalCases']].set_index("FIPS"), how='inner')
cases[['FIPS', 'totalCases']]
counties_deaths["totalDeaths"] = counties_deaths["4/9/20"] + counties_deaths["4/10/20"] + \
counties_deaths["4/11/20"] + counties_deaths["4/12/20"] + \
counties_deaths["4/13/20"] + counties_deaths["4/14/20"] + \
counties_deaths["4/15/20"] + counties_deaths["4/16/20"] + \
counties_deaths["4/17/20"] + counties_deaths["4/18/20"]
counties_deaths_Y = counties_deaths
counties_deaths
quan_counties = counties_deaths.copy()
for col in quan_counties:
data = quan_counties[col]
if data.dtype != 'float64':
quan_counties = quan_counties.drop(col, axis=1)
mod_counties_deaths = quan_counties.copy()
keep_col = [
# 'STATEFP',
# 'COUNTYFP',
# 'lat',
# 'lon',
# 'POP_LATITUDE',
# 'POP_LONGITUDE',
# 'PopTotalMale2017',
# 'PopTotalFemale2017',
# 'PopulationEstimate65+2017',
# 'CensusPopulation2010',
# '#EligibleforMedicare2018',
# '3-YrDiabetes2015-17',
# '#FTEHospitalTotal2017',
# "TotalM.D.'s,TotNon-FedandFed2017",
# '#Hospitals', '#ICU_beds',
# 'PopFmle<52010',
# 'PopFmle5-92010',
# 'PopFmle10-142010',
# 'PopFmle20-242010',
# 'PopFmle25-292010',
# 'PopFmle30-342010',
# 'PopFmle35-442010',
# 'PopFmle45-542010',
# 'PopFmle55-592010',
# 'PopFmle65-742010',
# 'PopFmle60-642010',
# 'PopFmle>842010',
# '3-YrMortalityAge25-34Years2015-17',
# '3-YrMortalityAge35-44Years2015-17',
# '3-YrMortalityAge45-54Years2015-17',
# '3-YrMortalityAge55-64Years2015-17',
# '3-YrMortalityAge65-74Years2015-17',
# '3-YrMortalityAge75-84Years2015-17',
# '3-YrMortalityAge85+Years2015-17',
# 'HPSAShortage', 'HPSAServedPop', 'HPSAUnderservedPop', 'Lat',
# 'Long_'
# 'PopFmle75-842010',
# 'PopFmle15-192010',
'Rural-UrbanContinuumCode2013',
'PopulationEstimate2018',
'PopulationDensityperSqMile2010',
'MedianAge2010',
'MedicareEnrollment,AgedTot2017',
'DiabetesPercentage',
'HeartDiseaseMortality', 'StrokeMortality', 'Smokers_Percentage',
'RespMortalityRate2014',
'#HospParticipatinginNetwork2017',
'dem_to_rep_ratio',
'PopMale<52010',
'PopMale5-92010',
'PopMale10-142010',
'PopMale15-192010',
'PopMale20-242010',
'PopMale25-292010',
'PopMale30-342010',
'PopMale35-442010',
'PopMale45-542010',
'PopMale55-592010',
'PopMale60-642010',
'PopMale65-742010',
'PopMale75-842010',
'PopMale>842010',
'stay at home',
'>50 gatherings', '>500 gatherings', 'public schools',
'restaurant dine-in', 'entertainment/gym', 'SVIPercentile',
]
for i, col in enumerate(mod_counties_deaths.columns):
if col not in keep_col:
mod_counties_deaths = mod_counties_deaths.drop(col, axis=1)
mod_counties_deaths
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
model = LinearRegression()
# float_idx = np.where(np.array([counties_deaths[col].dtype for col in counties_deaths.columns]) == 'float64')[0]
# X = counties_deaths.iloc[:, float_idx]
X = mod_counties_deaths
y = counties_deaths_Y['totalDeaths']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
svc = SVC(kernel="linear", C=1)
n_features = list(range(5, 6))
score = []
ranking = []
for n in n_features:
rfe = RFE(estimator=svc, n_features_to_select=n, step=1)
rfe.fit(X_train, y_train)
score.append(rfe.score(X_test, y_test))
ranking.append(rfe.ranking_)
# model.fit(X_train, y_train)
# model.score(X_test, y_test)
sum(rfe.predict(X_train) == 0) - len(X_train)
mod_counties_deaths.columns
a = list(zip(ranking[0], mod_counties_deaths.columns))
a.sort(key=lambda x: x[0])
a
mod_counties_deaths.columns[np.where(np.array(ranking[0] == 20))]