library(knitr)
library(knitrBootstrap)
library(DT)
library(tidyverse)
library(ggpubr)
library(gridExtra)
library(readxl)
library(MASS)
library(pROC)
library(ROCR)
library(boot)
library(nnet)
remove(list = ls())
# Globally controlling code blocks
opts_chunk$set(message = FALSE, # prevent messages
warning = FALSE, # prevent warnings
fig.align = "center", # graph position
fig.height = 3, # graph default height and width
fig.width = 6)
variables <- c("aircraft", "duration", "no_pasg", "speed_ground",
"speed_air", "height", "pitch", "distance")
descriptions <- c("the make of an aircraft (Boeing or Airbus)",
"duration in minutes - should always be greater than 40min",
"number of passengers in the flight",
"the ground speed of an aircraft when passing over the threshold of the runway.if its value is less than 30MPH or greater than 140MPH, then the landing would be considered as abnormal",
"the air speed of the aircraft, follows the same restrictions",
"The height of an aircraft when it is passing over the threshold of the runway. The landing aircraft is required to be at least 6 meters high at the threshold of the runway",
"Pitch angle of an aircraft when it is passing over the threshold of the runway",
"The landing distance of an aircraft. More specifically, it refers to the distance between the threshold of the runway and the point where the aircraft can be fully stopped. The length of the airport runway is typically less than 6000 feet")
data.frame("Variable" = variables, "Description" = descriptions) %>% kable()
Variable | Description |
---|---|
aircraft | the make of an aircraft (Boeing or Airbus) |
duration | duration in minutes - should always be greater than 40min |
no_pasg | number of passengers in the flight |
speed_ground | the ground speed of an aircraft when passing over the threshold of the runway.if its value is less than 30MPH or greater than 140MPH, then the landing would be considered as abnormal |
speed_air | the air speed of the aircraft, follows the same restrictions |
height | The height of an aircraft when it is passing over the threshold of the runway. The landing aircraft is required to be at least 6 meters high at the threshold of the runway |
pitch | Pitch angle of an aircraft when it is passing over the threshold of the runway |
distance | The landing distance of an aircraft. More specifically, it refers to the distance between the threshold of the runway and the point where the aircraft can be fully stopped. The length of the airport runway is typically less than 6000 feet |
file1 <- read_excel("FAA1.xls")
file2 <- read_excel("FAA2.xls")
file1_vars <- file1 %>% colnames() %>% as.data.frame()
colnames(file1_vars) <- "File 1"
file2_vars <- file2 %>% colnames()
file2_vars <- c(file2_vars[1], NA, file2_vars[2:7]) %>% as_data_frame()
colnames(file2_vars) <- "File 2"
bind_cols(file1_vars, file2_vars) %>% kable(caption = "Variables in two files")
File 1 | File 2 |
---|---|
aircraft | aircraft |
duration | NA |
no_pasg | no_pasg |
speed_ground | speed_ground |
speed_air | speed_air |
height | height |
pitch | pitch |
distance | distance |
We do not have duration
variable in the second file.
file2$duration <- NA
data <- bind_rows(file1, file2)
data %>% head(5) %>% kable(caption = "Raw data form the two files")
aircraft | duration | no_pasg | speed_ground | speed_air | height | pitch | distance |
---|---|---|---|---|---|---|---|
boeing | 98.47909 | 53 | 107.91568 | 109.3284 | 27.41892 | 4.043515 | 3369.836 |
boeing | 125.73330 | 69 | 101.65559 | 102.8514 | 27.80472 | 4.117432 | 2987.804 |
boeing | 112.01700 | 61 | 71.05196 | NA | 18.58939 | 4.434043 | 1144.922 |
boeing | 196.82569 | 56 | 85.81333 | NA | 30.74460 | 3.884236 | 1664.218 |
boeing | 90.09538 | 70 | 59.88853 | NA | 32.39769 | 4.026096 | 1050.264 |
data <- data[-(which(duplicated(data[,-2]) == TRUE)), ]
summary(data[,-1]) %>% as.data.frame() %>%
separate(Freq, into = c("stat", "value"), sep = ":" ) %>%
spread(key = stat, value = value) %>%
dplyr::select(2:9) %>% rename("Variable" = "Var2") %>%
kable(caption = "Summary statistics of each variable")
Variable | 1st Qu. | 3rd Qu. | Max. | Mean | Median | Min. | NA’s |
---|---|---|---|---|---|---|---|
duration | 119.49 | 188.91 | 305.62 | 154.01 | 153.95 | 14.76 | 50 |
no_pasg | 55.0 | 65.0 | 87.0 | 60.1 | 60.0 | 29.0 | NA |
speed_ground | 65.90 | 92.06 | 141.22 | 79.45 | 79.64 | 27.74 | NA |
speed_air | 96.25 | 109.40 | 141.72 | 103.80 | 101.15 | 90.00 | 642 |
height | 23.314 | 36.993 | 59.946 | 30.144 | 30.093 | -3.546 | NA |
pitch | 3.642 | 4.377 | 5.927 | 4.009 | 4.008 | 2.284 | NA |
distance | 883.79 | 1936.95 | 6533.05 | 1526.02 | 1258.09 | 34.08 | NA |
Boeing
and Airbus
duration = 50
, speed_air = 642
Going by the variable dictionary, there exist records with abonormal values.
abnormal_data <- data %>% filter(duration < 40)
abnormal_data <- bind_rows(abnormal_data, data %>% filter(height < 0))
data <- data %>% setdiff(abnormal_data)
summary(data[,-1]) %>% as.data.frame() %>%
separate(Freq, into = c("stat", "value"), sep = ":" ) %>%
spread(key = stat, value = value) %>%
dplyr::select(2:9) %>% rename("Variable" = "Var2") %>%
kable(caption = "Summary statistics after removing abnormal values")
Variable | 1st Qu. | 3rd Qu. | Max. | Mean | Median | Min. | NA’s |
---|---|---|---|---|---|---|---|
duration | 119.76 | 189.64 | 305.62 | 154.89 | 154.26 | 41.95 | 50 |
no_pasg | 55.00 | 65.00 | 87.00 | 60.08 | 60.00 | 29.00 | NA |
speed_ground | 65.93 | 91.89 | 141.22 | 79.47 | 79.70 | 27.74 | NA |
speed_air | 96.26 | 109.46 | 141.72 | 103.83 | 101.18 | 90.00 | 635 |
height | 23.46170 | 36.98963 | 59.94596 | 30.28298 | 30.13915 | 0.08611 | NA |
pitch | 3.641 | 4.376 | 5.927 | 4.008 | 4.004 | 2.284 | NA |
distance | 886.25 | 1936.32 | 6533.05 | 1527.64 | 1259.89 | 41.72 | NA |
histo_viz <- list()
for (i in variables) {
if (i == "aircraft") {
histo_viz[[i]] <- data %>% ggplot(aes_string(i)) + geom_bar()
}
else {
histo_viz[[i]] <- data %>% ggplot(aes_string(i)) + geom_histogram()
}
}
grid.arrange(histo_viz[[1]], histo_viz[[2]], histo_viz[[3]], histo_viz[[4]],
histo_viz[[5]], histo_viz[[6]], histo_viz[[7]], histo_viz[[8]],
nrow = 2, ncol = 4, top = "Histograms to visualize the data ranges" )
Boeing
and Airbus
duration = 50
, speed_air = 642
speed_ground
and speed_air
, all values are too small to be actually causing a correlation to landing distance. But the p-values indicate that speed_air
, speed_ground
, aircraft_bool
, height
and pitch
variables have a significant effect. data_with_bool <- data
data_with_bool$aircraft_bool <- ifelse(data$aircraft == "airbus", 0, 1)
data_with_bool2 <- data_with_bool
corr_table <- expand.grid(variable = c(variables[-c(1,8)], "aircraft_bool"),
stringsAsFactors = FALSE,
correlation = 0,
corr_direction = "text",
corr_p_value = 0)
j <- 1
for (i in c(variables[-c(1,8)], "aircraft_bool")) {
cor_test_var <- cor.test(data_with_bool$distance, data_with_bool[[i]])
corr_table$correlation[j] <- cor_test_var$estimate
if (cor_test_var$estimate < 0) {
corr_table$corr_direction[j] <- "negative"
} else if (cor_test_var$estimate > 0) {
corr_table$corr_direction[j] <- "positive"
} else {
corr_table$corr_direction[j] <- "neutral"
}
corr_table$correlation[j] <- cor_test_var$estimate
corr_table$corr_p_value[j] <- cor_test_var$p.value
j <- j + 1
}
corr_table %>% arrange(desc(abs(correlation))) %>% kable(caption = "Pairwise Correlation")
variable | correlation | corr_direction | corr_p_value |
---|---|---|---|
speed_air | 0.9478654 | positive | 0.0000000 |
speed_ground | 0.8616676 | positive | 0.0000000 |
aircraft_bool | 0.2362963 | positive | 0.0000000 |
height | 0.1154323 | positive | 0.0008030 |
pitch | 0.0972624 | positive | 0.0047810 |
duration | -0.0528738 | negative | 0.1375954 |
no_pasg | -0.0245707 | negative | 0.4769768 |
pairwise_viz <- list()
for (i in variables[-c(8)]) {
if (i == "aircraft") {
pairwise_viz[[i]] <- data %>% ggplot(aes_string(i, "distance")) +
geom_point(aes_string(col = i))
}
else {
pairwise_viz[[i]] <- data %>% ggplot(aes_string("distance", i)) + geom_point()
}
}
grid.arrange(pairwise_viz[[1]], pairwise_viz[[2]], pairwise_viz[[3]], pairwise_viz[[4]],
pairwise_viz[[5]], pairwise_viz[[6]], pairwise_viz[[7]],
nrow = 2, ncol = 4, top = "Pairwise correlation of each variable")
speed_air
, speed_ground
, aircraft_bool
, height
and pitch
variables have a significant effect on the landing distance, when regressed individually. uni_lm_table <- expand.grid(variable = c(variables[-c(1,8)], "aircraft_bool"),
stringsAsFactors = FALSE,
lm_coefficient = 0,
lm_p_value = 0)
j <- 1
for (i in c(variables[-c(1,8)], "aircraft_bool")) {
lm_summary <- lm(data_with_bool$distance ~ data_with_bool[[i]]) %>% summary()
uni_lm_table$lm_coefficient[j] <- lm_summary$coefficients[2,1]
uni_lm_table$lm_p_value[j] <- lm_summary$coefficients[2,4]
j <- j + 1
}
uni_lm_table %>% arrange(lm_p_value) %>% kable(caption = "Coefficient table of linear model")
variable | lm_coefficient | lm_p_value |
---|---|---|
speed_ground | 41.856226 | 0.0000000 |
speed_air | 82.162917 | 0.0000000 |
aircraft_bool | 438.268337 | 0.0000000 |
height | 10.702621 | 0.0008030 |
pitch | 170.844705 | 0.0047810 |
duration | -1.022458 | 0.1375954 |
no_pasg | -3.034033 | 0.4769768 |
data_standardized <- data_with_bool
for (i in c(2:9)) {
data_standardized[i] <- (data_standardized[i] - mean(data_standardized[[i]], na.rm = TRUE)) /
var(data_standardized[[i]], na.rm = TRUE)^0.5
}
uni_lm_table_std <- expand.grid(variable = c(variables[-c(1,8)], "aircraft_bool"),
stringsAsFactors = FALSE,
std_coefficient = 0,
std_p_value = 0)
j <- 1
for (i in c(variables[-c(1,8)], "aircraft_bool")) {
lm_summary <- lm(data_standardized$distance ~ data_standardized[[i]]) %>% summary()
uni_lm_table_std$std_coefficient[j] <- lm_summary$coefficients[2,1]
uni_lm_table_std$std_p_value[j] <- lm_summary$coefficients[2,4]
j <- j + 1
}
uni_lm_table_std %>% arrange(std_p_value) %>%
kable(caption = "Coefficient table of standardized linear model")
variable | std_coefficient | std_p_value |
---|---|---|
speed_ground | 0.8616676 | 0.0000000 |
speed_air | 0.9142403 | 0.0000000 |
aircraft_bool | 0.2362963 | 0.0000000 |
height | 0.1154323 | 0.0008030 |
pitch | 0.0972624 | 0.0047810 |
duration | -0.0534240 | 0.1375954 |
no_pasg | -0.0245707 | 0.4769768 |
table0 <- corr_table %>%
inner_join(uni_lm_table, by = "variable") %>%
inner_join(uni_lm_table_std, by = "variable") %>%
arrange(lm_p_value, std_p_value, corr_p_value)
table0 %>% kable(caption = "Summary of all three analyses")
variable | correlation | corr_direction | corr_p_value | lm_coefficient | lm_p_value | std_coefficient | std_p_value |
---|---|---|---|---|---|---|---|
speed_ground | 0.8616676 | positive | 0.0000000 | 41.856226 | 0.0000000 | 0.8616676 | 0.0000000 |
speed_air | 0.9478654 | positive | 0.0000000 | 82.162917 | 0.0000000 | 0.9142403 | 0.0000000 |
aircraft_bool | 0.2362963 | positive | 0.0000000 | 438.268337 | 0.0000000 | 0.2362963 | 0.0000000 |
height | 0.1154323 | positive | 0.0008030 | 10.702621 | 0.0008030 | 0.1154323 | 0.0008030 |
pitch | 0.0972624 | positive | 0.0047810 | 170.844705 | 0.0047810 | 0.0972624 | 0.0047810 |
duration | -0.0528738 | negative | 0.1375954 | -1.022458 | 0.1375954 | -0.0534240 | 0.1375954 |
no_pasg | -0.0245707 | negative | 0.4769768 | -3.034033 | 0.4769768 | -0.0245707 | 0.4769768 |
It was observed earlier that speed_air
and speed_ground
were correlated. This will lead to multicollinearity when building a model to predict the landing distance. The effect is clearly visible in the below table where the coefficient of speed_ground
changes direction and stops being significant. Thus is makes sense to keep only one of the variables in the final model.
speed_air
variable has 642 NA
rows, it’s advisable to leave that out. multi_col_table <- uni_lm_table %>% filter(variable %in% c("speed_air", "speed_ground"))
multi_col_table <- rbind(multi_col_table, multi_col_table[1:2,])
air_ground_model <- lm(distance ~ speed_air + speed_ground, data) %>% summary()
multi_col_table$variable[4] <- "ground + air - air"
multi_col_table$lm_coefficient[4] <- air_ground_model$coefficients["speed_air",1]
multi_col_table$lm_p_value[4] <- air_ground_model$coefficients["speed_air",4]
multi_col_table$variable[3] <- "ground + air - ground"
multi_col_table$lm_coefficient[3] <- air_ground_model$coefficients["speed_ground",1]
multi_col_table$lm_p_value[3] <- air_ground_model$coefficients["speed_ground",4]
multi_col_table %>% kable(caption = "Effect of multi-collinearity on both speed variables")
variable | lm_coefficient | lm_p_value |
---|---|---|
speed_ground | 41.85623 | 0.0000000 |
speed_air | 82.16292 | 0.0000000 |
ground + air - ground | -14.77547 | 0.2605788 |
ground + air - air | 96.98248 | 0.0000000 |
Now that the individual effect of the variables is established, multiple features can be used in the model to check for higher performance.
signif_vars <- table0 %>% filter(variable != "speed_air") %>% .$variable
final_coef <- expand.grid(no_of_vars = 1:length(signif_vars),
variables = "text", stringsAsFactors = FALSE,
r_squared = 0,
adj_r_squared = 0,
aic = 0)
for (i in 1:length(signif_vars)) {
vars_to_lm <- signif_vars[1:i]
lm_model <- lm(as.formula(paste("distance ~", paste(vars_to_lm, collapse = " + "))),
data_with_bool)
lm_summary <- lm_model %>% summary()
final_coef$variables[i] <- paste(vars_to_lm, collapse = " + ")
final_coef$r_squared[i] <- lm_summary$r.squared
final_coef$adj_r_squared[i] <- lm_summary$adj.r.squared
final_coef$aic[i] <- lm_model %>% AIC()
}
final_coef %>% kable(caption = "Performance metrics of models with multiple variables")
no_of_vars | variables | r_squared | adj_r_squared | aic |
---|---|---|---|---|
1 | speed_ground | 0.7424711 | 0.7421638 | 12725.62 |
2 | speed_ground + aircraft_bool | 0.8170914 | 0.8166544 | 12440.22 |
3 | speed_ground + aircraft_bool + height | 0.8404220 | 0.8398493 | 12327.60 |
4 | speed_ground + aircraft_bool + height + pitch | 0.8412080 | 0.8404473 | 12325.45 |
5 | speed_ground + aircraft_bool + height + pitch + duration | 0.8417045 | 0.8406950 | 11608.33 |
6 | speed_ground + aircraft_bool + height + pitch + duration + no_pasg | 0.8420378 | 0.8408274 | 11608.67 |
grid.arrange(final_coef %>% ggplot(aes(no_of_vars, r_squared)) + geom_point() + geom_line(),
final_coef %>% ggplot(aes(no_of_vars, adj_r_squared)) + geom_point() + geom_line(),
final_coef %>% ggplot(aes(no_of_vars, aic)) + geom_point() + geom_line(),
nrow = 3, ncol = 1, top = "Increasing performance with increasing number of response variables")
When looking at the R2 and adj.R2, the performance plateaus at 3 while AIC reduces to the least value with 5 variables. Building the model with 5 variables would be optimal.
The model and the parameters are tabulated below.
no_of_vars | variables | r_squared | adj_r_squared | aic | |
---|---|---|---|---|---|
5 | 5 | speed_ground + aircraft_bool + height + pitch + duration | 0.8417045 | 0.840695 | 11608.33 |
stepAIC
expects data to be NA free, the duration
variable is filtered. data_na_rem <- na.omit(data_with_bool[,-c(1,5)])
full_model <- lm(distance ~ . , data_na_rem)
null_model <- lm(distance ~ 1 , data_na_rem)
step_aic_model <- stepAIC(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "forward",
trace = 0) #disable the trace output
distance ~ speed_ground + aircraft_bool + height
. data_na_rem <- na.omit(data_with_bool[,-c(1,5)])
for (i in 1:length(signif_vars)) {
vars_to_lm <- signif_vars[1:i]
lm_model <- lm(as.formula(paste("distance ~", paste(vars_to_lm, collapse = " + "))),
data_na_rem)
lm_summary <- lm_model %>% summary()
final_coef$variables[i] <- paste(vars_to_lm, collapse = " + ")
final_coef$aic[i] <- lm_model %>% AIC()
}
final_coef %>% dplyr::select(c("variables", "aic")) %>%
kable(caption = "AIC plateaus at line three")
variables | aic |
---|---|
speed_ground | 11977.73 |
speed_ground + aircraft_bool | 11711.45 |
speed_ground + aircraft_bool + height | 11606.04 |
speed_ground + aircraft_bool + height + pitch | 11606.46 |
speed_ground + aircraft_bool + height + pitch + duration | 11608.33 |
speed_ground + aircraft_bool + height + pitch + duration + no_pasg | 11608.67 |
Now the model matches with the results of stepAIC
, as it can be seen that the model distance ~ speed_ground + aircraft_bool + height
has the best AIC.
long_landing
and risky_landing
are introduced into the dataset. They are binary, set to 1 if distance > 2500
or distance > 3000
respectively. The distributions of the new variables are shown below.
data_with_bool$long_landing <- as.factor(ifelse(data$distance > 2500, 1, 0))
data_with_bool$risky_landing <- as.factor(ifelse(data$distance > 3000, 1, 0))
histo_viz <- list()
for (i in c("long_landing", "risky_landing")) {
histo_viz[[i]] <- data_with_bool %>% ggplot(aes_string(i, label = i)) + geom_bar() +
geom_text(stat = "count", aes(label = ..count..), vjust=-1)
}
grid.arrange(grobs = histo_viz, nrow = 1, ncol = 2, top = "Distribution of binary variables" )
speed_air
, speed_ground
, aircraft_bool
and pitch
variables have a significant effect on long_landing
. uni_glm_table_long <- expand.grid(variable = c(variables[-c(1,8)], "aircraft_bool"),
stringsAsFactors = FALSE,
glm_coefficient = 0,
glm_p_value = 0,
glm_direction = "a",
glm_odds = 0)
j <- 1
for (i in c(variables[-c(1,8)], "aircraft_bool")) {
set.seed(1804)
lm_summary <- glm(data_with_bool$long_landing ~ data_with_bool[[i]],
family = binomial) %>% summary()
uni_glm_table_long$glm_coefficient[j] <- lm_summary$coefficients[2,1]
uni_glm_table_long$glm_p_value[j] <- lm_summary$coefficients[2,4]
uni_glm_table_long$glm_direction[j] <- ifelse(lm_summary$coefficients[2,1] > 0,
"positive", "negative")
uni_glm_table_long$glm_odds[j] <- exp(lm_summary$coefficients[2,1])
j <- j + 1
}
uni_glm_table_long %>% arrange(glm_p_value) %>% mutate_if(is.numeric, round, 4) %>%
kable(caption = "Coefficient table of LONG_LANDING, arranged by p-value")
variable | glm_coefficient | glm_p_value | glm_direction | glm_odds |
---|---|---|---|---|
speed_ground | 0.4723 | 0.0000 | positive | 1.6038 |
speed_air | 0.5123 | 0.0000 | positive | 1.6692 |
aircraft_bool | 0.8774 | 0.0001 | positive | 2.4047 |
pitch | 0.4335 | 0.0297 | positive | 1.5427 |
height | 0.0109 | 0.2966 | positive | 1.0110 |
no_pasg | -0.0080 | 0.5637 | negative | 0.9920 |
duration | -0.0011 | 0.6027 | negative | 0.9989 |
long_landing
, the significant factors identified earlier are visualized and it’s observed that the distribution is similar.
long_signif_vars <- uni_glm_table_long %>% filter(glm_p_value < 0.05) %>% arrange(glm_p_value)
association_viz <- list()
for (i in long_signif_vars$variable) {
association_viz[[i]] <- data_with_bool %>%
ggplot(aes_string(x = i, fill = "long_landing")) +
geom_histogram(position = "dodge", binwidth = 1)
}
grid.arrange(grobs = association_viz, nrow = 2, ncol = 2, top = "Distrubution of binary variables")
association_viz2 <- list()
for (i in long_signif_vars$variable) {
association_viz2[[i]] <- data_with_bool %>% ggplot(aes_string(i, "long_landing")) +
geom_jitter(alpha = 0.8)
}
grid.arrange(grobs = association_viz2, nrow = 2, ncol = 2, top = "Association")
As established earlier in the multicollinearity section, speed_air
and speed_ground
are highly correlated and hence speed_air
will not be used in the model. Thus the full model will be long_landing ~ speed_ground, aircraft_bool, pitch
. The coefficient table output is presented below.
set.seed(1804)
long_model <- glm(long_landing ~ speed_ground + pitch + aircraft_bool,
family = binomial, data_with_bool)
long_aic <- summary(long_model) %>% .$aic
summary(long_model) %>% .$coefficients %>% as.data.frame() %>%
rownames_to_column("Variable") %>% mutate_if(is.numeric, round, 4) %>%
arrange(`Pr(>|z|)`) %>% kable()
Variable | Estimate | Std. Error | z value | Pr(>|z|) |
---|---|---|---|---|
(Intercept) | -67.9285 | 10.4840 | -6.4793 | 0.0000 |
speed_ground | 0.6147 | 0.0918 | 6.6936 | 0.0000 |
aircraft_bool | 3.0435 | 0.7334 | 4.1496 | 0.0000 |
pitch | 1.0660 | 0.6039 | 1.7652 | 0.0775 |
set.seed(1804)
data_na_rem <- na.omit(data_with_bool[,-c(1,5,8)])
full_model <- glm(long_landing ~ .,
family = binomial, data_na_rem)
null_model <- glm(long_landing ~ 1,
family = binomial, data_na_rem)
step_aic_model <- step(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "forward",
trace = 0) #disable the trace output
The model with best AIC has the formula long_landing ~ speed_ground + aircraft_bool + height + pitch
. This almost matches with significance test output except for the height
variable.
set.seed(1804)
data_na_rem <- na.omit(data_with_bool[,-c(1,5,8)])
full_model <- glm(long_landing ~ .,
family = binomial, data_na_rem)
null_model <- glm(long_landing ~ 1,
family = binomial, data_na_rem)
step_bic_model <- step(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "forward", k = log(nrow(data_na_rem)),
trace = 0) #disable the trace output
The model with best BIC has the formula long_landing ~ speed_ground + aircraft_bool + height
, same as that we got for stepAIC.
uni_glm_table_long %>% arrange(desc(glm_coefficient)) %>%
mutate_if(is.numeric, round, 4) %>% dplyr::select(c(1, 2, 4)) %>% kable()
variable | glm_coefficient | glm_direction |
---|---|---|
aircraft_bool | 0.8774 | positive |
speed_air | 0.5123 | positive |
speed_ground | 0.4723 | positive |
pitch | 0.4335 | positive |
height | 0.0109 | positive |
duration | -0.0011 | negative |
no_pasg | -0.0080 | negative |
speed_air
and speed_ground
variables have a significant effect on risky_landing
. uni_glm_table_risky <- expand.grid(variable = c(variables[-c(1,8)], "aircraft_bool"),
stringsAsFactors = FALSE,
glm_coefficient = 0,
glm_p_value = 0,
glm_direction = "a",
glm_odds = 0)
j <- 1
for (i in c(variables[-c(1,8)], "aircraft_bool")) {
set.seed(1804)
lm_summary <- glm(data_with_bool$risky_landing ~ data_with_bool[[i]],
family = binomial) %>% summary()
uni_glm_table_risky$glm_coefficient[j] <- lm_summary$coefficients[2,1]
uni_glm_table_risky$glm_p_value[j] <- lm_summary$coefficients[2,4]
uni_glm_table_risky$glm_direction[j] <- ifelse(lm_summary$coefficients[2,1] > 0,
"positive", "negative")
uni_glm_table_risky$glm_odds[j] <- exp(lm_summary$coefficients[2,1])
j <- j + 1
}
uni_glm_table_risky %>% arrange(glm_p_value) %>% mutate_if(is.numeric, round, 4) %>%
kable(caption = "Coefficient table of risky_landing, arranged by p-value")
variable | glm_coefficient | glm_p_value | glm_direction | glm_odds |
---|---|---|---|---|
speed_ground | 0.6142 | 0.0000 | positive | 1.8482 |
speed_air | 0.8704 | 0.0000 | positive | 2.3879 |
aircraft_bool | 1.0334 | 0.0003 | positive | 2.8106 |
pitch | 0.4319 | 0.0836 | positive | 1.5403 |
no_pasg | -0.0257 | 0.1420 | negative | 0.9746 |
duration | -0.0012 | 0.6536 | negative | 0.9988 |
height | 0.0010 | 0.9374 | positive | 1.0010 |
risky_landing
, the significant factors identified earlier are visualized and it’s observed that the distribution is similar.
long_signif_vars <- uni_glm_table_risky %>% filter(glm_p_value < 0.05) %>% arrange(glm_p_value)
association_viz <- list()
for (i in long_signif_vars$variable) {
association_viz[[i]] <- data_with_bool %>%
ggplot(aes_string(x = i, fill = "risky_landing")) +
geom_histogram(position = "dodge", binwidth = 1)
}
grid.arrange(grobs = association_viz, nrow = 2, ncol = 2, top = "Distrubution of binary variables" )
association_viz2 <- list()
for (i in long_signif_vars$variable) {
association_viz2[[i]] <- data_with_bool %>% ggplot(aes_string(i, "risky_landing")) +
geom_jitter(alpha = 0.8)
}
grid.arrange(grobs = association_viz2, nrow = 2, ncol = 2, top = "Association")
As established earlier in the multicollinearity section, speed_air
and speed_ground
are highly correlated and hence speed_air
will not be used in the model. Thus the full model will be risky_landing ~ speed_ground, aircraft_bool
. The coefficient table output is presented below.
set.seed(1804)
risky_model <- glm(risky_landing ~ speed_ground + aircraft_bool,
family = binomial, data_with_bool)
risky_aic <- summary(risky_model) %>% .$aic
summary(risky_model) %>% .$coefficients %>% as.data.frame() %>%
rownames_to_column("Variable") %>% mutate_if(is.numeric, round, 4) %>%
arrange(`Pr(>|z|)`) %>% kable()
Variable | Estimate | Std. Error | z value | Pr(>|z|) |
---|---|---|---|---|
(Intercept) | -102.0772 | 24.7750 | -4.1202 | 0.0000 |
speed_ground | 0.9263 | 0.2248 | 4.1206 | 0.0000 |
aircraft_bool | 4.0190 | 1.2494 | 3.2168 | 0.0013 |
To ensure the correct set of variables are chosen for the model, stepwise regression is performed with all the variables we have, with AIC to measure performance.
set.seed(1804)
data_na_rem <- na.omit(data_with_bool[,-c(1,5,8)])
full_model <- glm(risky_landing ~ .,
family = binomial, data_na_rem)
null_model <- glm(risky_landing ~ 1,
family = binomial, data_na_rem)
step_aic_model <- step(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "forward",
trace = 0) #disable the trace output
The model with best AIC has the formula risky_landing ~ speed_ground + aircraft_bool
. This matches with significance test output.
set.seed(1804)
data_na_rem <- na.omit(data_with_bool[,-c(1,5,8)])
full_model <- glm(risky_landing ~ .,
family = binomial, data_na_rem)
null_model <- glm(risky_landing ~ 1,
family = binomial, data_na_rem)
step_bic_model <- step(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "forward", k = log(nrow(data_na_rem)),
trace = 0) #disable the trace output
The model with best BIC has the formula risky_landing ~ speed_ground + aircraft_bool
, same as AIC and the significance test parameters.
uni_glm_table_risky %>% arrange(desc(glm_coefficient)) %>%
mutate_if(is.numeric, round, 4) %>% dplyr::select(c(1, 2, 4)) %>% kable()
variable | glm_coefficient | glm_direction |
---|---|---|
aircraft_bool | 1.0334 | positive |
speed_air | 0.8704 | positive |
speed_ground | 0.6142 | positive |
pitch | 0.4319 | positive |
height | 0.0010 | positive |
duration | -0.0012 | negative |
no_pasg | -0.0257 | negative |
long_landing
was modeled with speed_ground, aircraft_bool and pitch
while risky_landing
was modeled with speed_ground and aircraft_bool
long_landing
model 89.30904 while that of risky_landing
is 46.0965964. This could indicate that we may have higer success of risky landing prediction
The ROC curves of both the models are plotted below and they follow a very similar pattern.
prediction(predict(long_model, type = "response"), data_with_bool$long_landing) %>%
performance("tpr", "fpr") %>%
plot(colorize = TRUE)
prediction(predict(risky_model, type = "response"), data_with_bool$risky_landing) %>%
performance("tpr", "fpr") %>%
plot(add = TRUE, colorize = TRUE)
A commercial airplane is passing over the threshold of the runway, at this moment we have its basic information and measures of its airborne performance (Boeing, duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4). Predict its probability of being a long landing and a risky landing, respectively. Report the predicted probability as well as its 95% confidence interval.
test_data <- data.frame(aircraft_bool = 1, duration = 200, no_pasg = 80, speed_ground = 115,
speed_air = 120, height = 40, pitch = 4)
long_prediction <- predict(long_model, test_data, type = "link", se.fit = TRUE)
risky_prediction <- predict(risky_model, test_data, type = "link", se.fit = TRUE)
long_prediction <- c(inv.logit(long_prediction$fit),
inv.logit(long_prediction$fit - 1.96 * long_prediction$se.fit),
inv.logit(long_prediction$fit + 1.96 * long_prediction$se.fit))
risky_prediction <- c(inv.logit(risky_prediction$fit),
inv.logit(risky_prediction$fit - 1.96 * risky_prediction$se.fit),
inv.logit(risky_prediction$fit + 1.96 * risky_prediction$se.fit))
The predicted probability of long_landing
is 0.9999577 and the 95% confidence interval is 0.9989955 and 0.9999982.
The predicted probability of risky_landing
is 0.999789 and the 95% confidence interval is 0.9874844 and 0.9999965.
Looks like this plane is headed for trouble.
The last few steps of risky_landing
are repeated with probit and complimentary log-log logistic models. As expected, the probit and c log-log coefficients are about 1.6 ~ 1.8 times lower than logit.
set.seed(1804)
risky_model <- glm(risky_landing ~ speed_ground + aircraft_bool,
family = binomial, data_with_bool)
logit_coef <- summary(risky_model) %>% .$coefficients %>% as.data.frame() %>%
rownames_to_column("Variable") %>% mutate_if(is.numeric, round, 4) %>%
arrange(`Pr(>|z|)`)
set.seed(1804)
prob_risky_model <- glm(risky_landing ~ speed_ground + aircraft_bool,
family = binomial(link = "probit"), data_with_bool)
prob_coef <- summary(prob_risky_model) %>% .$coefficients %>% as.data.frame() %>%
rownames_to_column("Variable") %>% mutate_if(is.numeric, round, 4) %>%
arrange(`Pr(>|z|)`)
set.seed(1804)
clog_risky_model <- glm(risky_landing ~ speed_ground + aircraft_bool,
family = binomial(link = "cloglog"), data_with_bool)
clog_coef <- summary(clog_risky_model) %>% .$coefficients %>% as.data.frame() %>%
rownames_to_column("Variable") %>% mutate_if(is.numeric, round, 4) %>%
arrange(`Pr(>|z|)`)
bind_rows(logit_coef %>% mutate(Link = "Logit"),
prob_coef %>% mutate(Link = "Probit"),
clog_coef %>% mutate(Link = "C Log-Log")) %>%
dplyr::select(c(6, 1:5)) %>% kable(caption = "Coefficients compared")
Link | Variable | Estimate | Std. Error | z value | Pr(>|z|) |
---|---|---|---|---|---|
Logit | (Intercept) | -102.0772 | 24.7750 | -4.1202 | 0.0000 |
Logit | speed_ground | 0.9263 | 0.2248 | 4.1206 | 0.0000 |
Logit | aircraft_bool | 4.0190 | 1.2494 | 3.2168 | 0.0013 |
Probit | (Intercept) | -58.6931 | 13.3132 | -4.4086 | 0.0000 |
Probit | speed_ground | 0.5322 | 0.1207 | 4.4112 | 0.0000 |
Probit | aircraft_bool | 2.3567 | 0.7016 | 3.3591 | 0.0008 |
C Log-Log | (Intercept) | -69.2654 | 14.7396 | -4.6993 | 0.0000 |
C Log-Log | speed_ground | 0.6221 | 0.1326 | 4.6898 | 0.0000 |
C Log-Log | aircraft_bool | 2.8984 | 0.8002 | 3.6221 | 0.0003 |
Plotting the ROC curves for all three models, it’s observed that they follow an extremely similar path.
prediction(predict(risky_model, type = "response"), data_with_bool$risky_landing) %>%
performance("tpr", "fpr") %>%
plot(colorize = TRUE)
prediction(predict(prob_risky_model, type = "response"), data_with_bool$risky_landing) %>%
performance("tpr", "fpr") %>%
plot(add = TRUE, colorize = TRUE)
prediction(predict(clog_risky_model, type = "response"), data_with_bool$risky_landing) %>%
performance("tpr", "fpr") %>%
plot(add = TRUE, colorize = TRUE)
The top 5 flights predicted to risky_landing
are printed below for each model.
top_5_risk <- bind_cols(predict(risky_model, type = "response") %>% as.data.frame() %>%
rownames_to_column("Logit") %>% dplyr::arrange(desc(`.`)) %>%
dplyr::select(1) %>% head(5),
predict(prob_risky_model, type = "response") %>% as.data.frame() %>%
rownames_to_column("Probit") %>% dplyr::arrange(desc(`.`)) %>%
dplyr::select(1) %>% head(5),
predict(clog_risky_model, type = "response") %>% as.data.frame() %>%
rownames_to_column("C Log Log") %>% dplyr::arrange(desc(`.`)) %>%
dplyr::select(1) %>% head(5))
top_5_risk %>% kable(caption = "Indices of top 5 flights at risk")
Logit | Probit | C Log Log |
---|---|---|
12 | 12 | 12 |
164 | 57 | 20 |
368 | 66 | 30 |
313 | 138 | 31 |
66 | 164 | 57 |
Matching the index we identify ten unique flights out of the 15 predicted by the three models. All three models predict Boeing flights to be at higher risk than Airbus flights.
data_with_bool[top_5_risk %>% as.matrix(),] %>%
filter(!duplicated(data_with_bool[top_5_risk %>% as.matrix(),])) %>%
kable(caption = "Flights that have high probability of risky landing")
aircraft | duration | no_pasg | speed_ground | speed_air | height | pitch | distance | aircraft_bool | long_landing | risky_landing |
---|---|---|---|---|---|---|---|---|---|---|
boeing | 180.61656 | 54 | 141.2186 | 141.7249 | 23.57594 | 5.216802 | 6533.048 | 1 | 1 | 1 |
boeing | 119.92455 | 64 | 136.6592 | 136.4234 | 44.28611 | 4.169404 | 6309.946 | 1 | 1 | 1 |
boeing | 63.32952 | 52 | 132.7847 | 132.9115 | 18.17703 | 4.110664 | 5343.201 | 1 | 1 | 1 |
boeing | 154.52460 | 67 | 129.3072 | 127.5933 | 23.97850 | 5.154699 | 5058.470 | 1 | 1 | 1 |
boeing | 161.89247 | 72 | 129.2649 | 128.4177 | 33.94900 | 4.139951 | 5381.959 | 1 | 1 | 1 |
boeing | 116.98454 | 67 | 122.7566 | 123.8826 | 30.21657 | 3.213703 | 4807.880 | 1 | 1 | 1 |
boeing | 209.10635 | 58 | 124.5699 | 125.9869 | 40.10112 | 4.648428 | 5147.410 | 1 | 1 | 1 |
boeing | 163.90650 | 55 | 119.3805 | 120.4447 | 38.55854 | 3.701449 | 4524.279 | 1 | 1 | 1 |
boeing | 140.23631 | 65 | 118.7420 | 119.4021 | 19.85619 | 4.646266 | 4217.129 | 1 | 1 | 1 |
boeing | 130.46356 | 52 | 116.7134 | 117.6565 | 36.19553 | 3.894352 | 4240.094 | 1 | 1 | 1 |
The three models are applied on the new line of data that we saw earlier and the probabilities are compared.
risky_prediction <- predict(risky_model, test_data, type = "link", se.fit = TRUE)
probit_prediction <- predict(prob_risky_model, test_data, type = "link", se.fit = TRUE)
clog_prediction <- predict(clog_risky_model, test_data, type = "link", se.fit = TRUE)
risky_prediction <- c(inv.logit(risky_prediction$fit),
inv.logit(risky_prediction$fit - 1.96 * risky_prediction$se.fit),
inv.logit(risky_prediction$fit + 1.96 * risky_prediction$se.fit))
probit_prediction <- c(inv.logit(probit_prediction$fit),
inv.logit(probit_prediction$fit - 1.96 * probit_prediction$se.fit),
inv.logit(probit_prediction$fit + 1.96 * probit_prediction$se.fit))
clog_prediction <- c(inv.logit(clog_prediction$fit),
inv.logit(clog_prediction$fit - 1.96 * clog_prediction$se.fit),
inv.logit(clog_prediction$fit + 1.96 * clog_prediction$se.fit))
data.frame(model = c("Probability", "Left Interval", "Right Interval"),
logit = risky_prediction,
probit = probit_prediction,
clog = clog_prediction) %>% kable()
model | logit | probit | clog |
---|---|---|---|
Probability | 0.9997890 | 0.9924005 | 0.9943444 |
Left Interval | 0.9874844 | 0.9347090 | 0.9463193 |
Right Interval | 0.9999965 | 0.9991612 | 0.9994300 |
It is observed that the standard errors of probit and complementary log-log is much higher, causing the probability to fluctuate much more than logit does.
A multinomial variable Y
is created and attached to the dataset.
Y = 1 if distance < 1000
Y = 2 if 1000 < = distance < 2500
Y = 3 otherwise
The distribution of the new variable is visualized in the histogram below.
data <- na.omit(data_with_bool[, -c(1, 5, 10, 11)])
data$y <- as.factor(ifelse(data$distance < 1000, 1,
ifelse(data$distance < 2500, 2, 3)))
data <- data[, -6]
data$y <- factor(data$y, levels = c(3, 2, 1))
data %>% ggplot(aes(y, label = y)) + geom_bar(aes(fill = y)) +
geom_text(stat = "count", aes(label = ..count..), vjust = 2)
Visible in below graph is that, the proportion of y is significantly different in aircraft_bool
and speed_ground
. A slighter pattern is observable in case of height
and pitch
but none for duration
and no_pasg
.
viz <- list()
viz[[1]] <- data %>% group_by(aircraft_bool, y) %>% summarize(count = n()) %>%
ggplot(aes(as.factor(aircraft_bool))) + geom_col(aes(y = count, fill = y)) +
theme(axis.title.x = element_blank())
viz[[2]] <- data %>% arrange(y) %>% ggplot(aes(1:nrow(data), speed_ground)) +
geom_line(aes(col = y), show.legend = FALSE) + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
viz[[3]] <- data %>% arrange(y) %>% ggplot(aes(1:nrow(data), duration)) +
geom_line(aes(col = y), show.legend = FALSE) + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
viz[[4]] <- data %>% arrange(y) %>% ggplot(aes(1:nrow(data), no_pasg)) +
geom_line(aes(col = y), show.legend = FALSE) + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
viz[[5]] <- data %>% arrange(y) %>% ggplot(aes(1:nrow(data), height)) +
geom_line(aes(col = y), show.legend = FALSE) + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
viz[[6]] <- data %>% arrange(y) %>% ggplot(aes(1:nrow(data), pitch)) +
geom_line(aes(col = y), show.legend = FALSE) + theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
grid.arrange(grobs = viz, nrow = 3, ncol = 2,
top = "Relationship between each variable against y" )
Stepwise regression is performed to get the model with the lowest AIC. Direction is set to both.
set.seed(1804)
full_model <- multinom(y ~ . , data)
null_model <- multinom(y ~ 1 , data)
step_aic_model <- stepAIC(null_model, scope = list(lower = null_model,
upper = full_model),
direction = "both",
trace = 0) #disable the trace output
# as.formula(step_aic_model$call[[2]])
The model y ~ speed_ground + aircraft_bool + height + pitch
is chosen as the best model and this corroborates with our inference from the visualization earlier. The summary of the model is printed below.
model_summary <- multinom(as.formula(step_aic_model$call[[2]]), data) %>%
summary() %>% .$coefficients %>% as_data_frame() %>% mutate(y = c("y = 2", "y = 1")) %>%
dplyr::select(6, 1:5)
model_summary %>% kable(caption = "Intercepts for two of the multinomial variables")
y | (Intercept) | speed_ground | aircraft_bool | height | pitch |
---|---|---|---|---|---|
y = 2 | 113.2645 | -0.9720931 | -4.919093 | -0.2417678 | -1.337133 |
y = 1 | 132.8014 | -1.1896100 | -8.645257 | -0.3824548 | -1.011077 |
Clearly evident, the distribution of number of passengers is normal. But since the number cannot be negative, a log-normal distribution would be a better fit.
data_with_bool <- data_with_bool2[, -c(1)]
grid.arrange(data_with_bool %>% ggplot(aes(no_pasg)) + geom_histogram(),
ggplot(data_with_bool, aes(sample = no_pasg)) +
geom_qq(alpha = 0.3) + geom_qq_line(linetype = "dashed", color = "red2"),
ncol = 2, nrow = 1)
The mean and standard deviation in our dataset is 60.0809524 and 7.5013305.
The other variables of the dataset is correlated against no_pasg
to test if they could be predicted. But none of the correleation is high enough/ significant enough suggesting that no_pasg
is a independant variable.
corr_table <- expand.grid(variable = colnames(data_with_bool[-2]),
stringsAsFactors = FALSE,
correlation = 0,
corr_direction = "text",
corr_p_value = 0)
j <- 1
for (i in colnames(data_with_bool[-2])) {
cor_test_var <- cor.test(data_with_bool$no_pasg, data_with_bool[[i]])
corr_table$correlation[j] <- cor_test_var$estimate
if (cor_test_var$estimate < 0) {
corr_table$corr_direction[j] <- "negative"
} else if (cor_test_var$estimate > 0) {
corr_table$corr_direction[j] <- "positive"
} else {
corr_table$corr_direction[j] <- "neutral"
}
corr_table$correlation[j] <- cor_test_var$estimate
corr_table$corr_p_value[j] <- cor_test_var$p.value
j <- j + 1
}
corr_table %>% arrange(desc(abs(corr_p_value))) %>% kable(caption = "Pairwise Correlation")
variable | correlation | corr_direction | corr_p_value |
---|---|---|---|
speed_ground | -0.0010907 | negative | 0.9748193 |
speed_air | -0.0115736 | negative | 0.8691793 |
aircraft_bool | -0.0194000 | negative | 0.5744694 |
pitch | -0.0198713 | negative | 0.5652069 |
distance | -0.0245707 | negative | 0.4769768 |
height | 0.0319047 | positive | 0.3557235 |
duration | -0.0383693 | negative | 0.2814213 |