Starting fresh
# Drop every object that ls() reports so earlier session objects cannot leak into this analysis.
# NOTE(review): this does not detach packages or reset options; restarting the R session is the more reliable way to start clean.
remove(list = ls())
Get the Data
# Goal: read an Excel file from a URL into a data frame called 'fragility23'.
# linkGit holds the address of the raw Excel file hosted on GitHub.
linkGit <- "https://github.com/DACSS-Fundamentals/overview/raw/refs/heads/main/FSI-2023-DOWNLOAD.xlsx"
# install.packages("rio")  # one-time install; rio supplies import()
library(rio)
## Warning: package 'rio' was built under R version 4.4.3
# import() is given the URL directly; the result is stored as 'fragility23'.
fragility23 <- rio::import(linkGit)
Exploratory commands
# List every column name of the imported data frame.
names(fragility23)
## [1] "Country" "Year"
## [3] "Rank" "Total"
## [5] "S1: Demographic Pressures" "S2: Refugees and IDPs"
## [7] "C3: Group Grievance" "E3: Human Flight and Brain Drain"
## [9] "E2: Economic Inequality" "E1: Economy"
## [11] "P1: State Legitimacy" "P2: Public Services"
## [13] "P3: Human Rights" "C1: Security Apparatus"
## [15] "C2: Factionalized Elites" "X1: External Intervention"
# Inspect the structure of the data frame: each column with its data type.
# Use this to confirm that the numeric columns were read as numbers.
str(fragility23)
## 'data.frame': 179 obs. of 16 variables:
## $ Country : chr "Somalia" "Yemen" "South Sudan" "Congo Democratic Republic" ...
## $ Year : num 2023 2023 2023 2023 2023 ...
## $ Rank : chr "1st" "2nd" "3rd" "4th" ...
## $ Total : num 112 109 109 107 107 ...
## $ S1: Demographic Pressures : num 10 9.6 9.7 9.7 7.4 9.2 8.8 9.3 9.5 8.8 ...
## $ S2: Refugees and IDPs : num 9 9.6 10 9.8 9.1 8.6 9.6 9.5 9 7.7 ...
## $ C3: Group Grievance : num 8.7 8.8 8.6 9.4 9.1 8.3 9.3 8.1 8.1 5.5 ...
## $ E3: Human Flight and Brain Drain: num 8.6 6.4 6.5 6.4 8 8.5 7.5 6.2 7.7 8.3 ...
## $ E2: Economic Inequality : num 9.1 7.9 8.6 8.4 6.5 8.2 8.5 9.6 8.7 9.2 ...
## $ E1: Economy : num 9.5 9.9 8.6 8.1 9.6 9.6 9.3 8.2 8.4 8.9 ...
## $ P1: State Legitimacy : num 9.6 9.8 9.8 9.3 10 9.4 9.4 8.9 9.1 9.9 ...
## $ P2: Public Services : num 9.8 9.6 9.7 9.3 9 10 8.6 10 9.6 9.8 ...
## $ P3: Human Rights : num 9 9.6 8.7 9.3 9.1 8.7 9.2 9.1 8.4 8.7 ...
## $ C1: Security Apparatus : num 9.5 8.6 9.9 8.8 9.4 9.7 8.3 8 8.7 6.8 ...
## $ C2: Factionalized Elites : num 10 9.9 9.2 9.6 9.9 8.7 9.6 9.4 9.5 9.7 ...
## $ X1: External Intervention : num 9.1 9.2 9.2 9.1 10 7.7 8.1 9.4 7.9 9.6 ...
# Preview the first 10 rows of the data frame.
head(fragility23, n = 10)
## Country Year Rank Total S1: Demographic Pressures
## 1 Somalia 2023 1st 111.9 10.0
## 2 Yemen 2023 2nd 108.9 9.6
## 3 South Sudan 2023 3rd 108.5 9.7
## 4 Congo Democratic Republic 2023 4th 107.2 9.7
## 5 Syria 2023 5th 107.1 7.4
## 6 Afghanistan 2023 6th 106.6 9.2
## 7 Sudan 2023 7th 106.2 8.8
## 8 Central African Republic 2023 8th 105.7 9.3
## 9 Chad 2023 9th 104.6 9.5
## 10 Haiti 2023 10th 102.9 8.8
## S2: Refugees and IDPs C3: Group Grievance E3: Human Flight and Brain Drain
## 1 9.0 8.7 8.6
## 2 9.6 8.8 6.4
## 3 10.0 8.6 6.5
## 4 9.8 9.4 6.4
## 5 9.1 9.1 8.0
## 6 8.6 8.3 8.5
## 7 9.6 9.3 7.5
## 8 9.5 8.1 6.2
## 9 9.0 8.1 7.7
## 10 7.7 5.5 8.3
## E2: Economic Inequality E1: Economy P1: State Legitimacy P2: Public Services
## 1 9.1 9.5 9.6 9.8
## 2 7.9 9.9 9.8 9.6
## 3 8.6 8.6 9.8 9.7
## 4 8.4 8.1 9.3 9.3
## 5 6.5 9.6 10.0 9.0
## 6 8.2 9.6 9.4 10.0
## 7 8.5 9.3 9.4 8.6
## 8 9.6 8.2 8.9 10.0
## 9 8.7 8.4 9.1 9.6
## 10 9.2 8.9 9.9 9.8
## P3: Human Rights C1: Security Apparatus C2: Factionalized Elites
## 1 9.0 9.5 10.0
## 2 9.6 8.6 9.9
## 3 8.7 9.9 9.2
## 4 9.3 8.8 9.6
## 5 9.1 9.4 9.9
## 6 8.7 9.7 8.7
## 7 9.2 8.3 9.6
## 8 9.1 8.0 9.4
## 9 8.4 8.7 9.5
## 10 8.7 6.8 9.7
## X1: External Intervention
## 1 9.1
## 2 9.2
## 3 9.2
## 4 9.1
## 5 10.0
## 6 7.7
## 7 8.1
## 8 9.4
## 9 7.9
## 10 9.6
# Preview the last 10 rows of the data frame.
tail(fragility23, n = 10)
## Country Year Rank Total S1: Demographic Pressures
## 170 Sweden 2023 170th 20.6 3.0
## 171 Luxembourg 2023 172nd 19.5 2.4
## 172 Ireland 2023 171st 19.5 2.8
## 173 Canada 2023 173rd 18.9 1.2
## 174 Denmark 2023 174th 17.9 2.3
## 175 Switzerland 2023 175th 17.8 2.4
## 176 New Zealand 2023 176th 16.7 1.1
## 177 Finland 2023 177th 16.0 1.7
## 178 Iceland 2023 178th 15.7 1.5
## 179 Norway 2023 179th 14.5 1.4
## S2: Refugees and IDPs C3: Group Grievance E3: Human Flight and Brain Drain
## 170 3.7 2.3 0.6
## 171 2.8 1.5 1.7
## 172 1.6 0.5 2.5
## 173 2.0 2.0 0.7
## 174 3.0 3.1 1.0
## 175 3.2 2.1 1.0
## 176 1.2 2.0 1.6
## 177 1.9 0.3 1.5
## 178 1.5 0.5 1.6
## 179 1.7 3.1 0.7
## E2: Economic Inequality E1: Economy P1: State Legitimacy
## 170 2.3 1.3 0.5
## 171 1.8 2.4 0.3
## 172 1.8 1.7 0.5
## 173 2.5 1.4 0.4
## 174 1.8 1.0 0.3
## 175 2.4 1.6 0.3
## 176 2.6 2.6 0.5
## 177 1.6 2.7 0.4
## 178 1.5 2.6 0.4
## 179 1.4 1.4 0.4
## P2: Public Services P3: Human Rights C1: Security Apparatus
## 170 1.0 1.5 2.1
## 171 1.3 1.1 0.4
## 172 1.9 1.6 2.1
## 173 1.7 1.9 2.2
## 174 1.7 0.6 1.1
## 175 1.6 0.4 1.4
## 176 1.1 0.5 1.6
## 177 1.0 0.5 2.0
## 178 0.9 0.4 0.4
## 179 1.0 0.4 1.4
## C2: Factionalized Elites X1: External Intervention
## 170 1.8 0.5
## 171 3.4 0.4
## 172 1.5 1.0
## 173 2.5 0.4
## 174 1.4 0.6
## 175 1.0 0.4
## 176 1.4 0.5
## 177 1.4 1.0
## 178 1.8 2.6
## 179 1.1 0.5
Transformative commands
# Preview the columns to keep: Country, Total, S1: Demographic Pressures,
# P1: State Legitimacy and E2: Economic Inequality.
# grep() searches the column names for a pattern:
#   pattern = "Country|S1|P1|E2|Total"  any one of these alternatives may match
#   x = names(fragility23)              the column names are what gets searched
#   fixed = FALSE                       treat the pattern as a regular expression
#   value = TRUE                        return the matching names, not their positions
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
grep(pattern = "Country|S1|P1|E2|Total", x = names(fragility23), fixed = FALSE, value = TRUE)
## [1] "Country" "Total"
## [3] "S1: Demographic Pressures" "E2: Economic Inequality"
## [5] "P1: State Legitimacy"
# Store the matching column names in 'keep' (same pattern as the preview above it):
#   fixed = FALSE  the pattern is a regular expression
#   value = TRUE   column names are returned instead of positions
# TRUE/FALSE are used instead of T/F, which are reassignable variables.
keep <- grep("Country|S1|P1|E2|Total", names(fragility23), fixed = FALSE, value = TRUE)
# Build the subset data frame from the kept columns.
# drop = FALSE guarantees a data frame even if only one column name matches.
frag23_sub <- fragility23[, keep, drop = FALSE]
# Check the structure of the new subset.
str(frag23_sub)
## 'data.frame': 179 obs. of 5 variables:
## $ Country : chr "Somalia" "Yemen" "South Sudan" "Congo Democratic Republic" ...
## $ Total : num 112 109 109 107 107 ...
## $ S1: Demographic Pressures: num 10 9.6 9.7 9.7 7.4 9.2 8.8 9.3 9.5 8.8 ...
## $ E2: Economic Inequality : num 9.1 7.9 8.6 8.4 6.5 8.2 8.5 9.6 8.7 9.2 ...
## $ P1: State Legitimacy : num 9.6 9.8 9.8 9.3 10 9.4 9.4 8.9 9.1 9.9 ...
# Preview the first rows of the subset (6 rows, the head() default).
head(frag23_sub, n = 6)
## Country Total S1: Demographic Pressures
## 1 Somalia 111.9 10.0
## 2 Yemen 108.9 9.6
## 3 South Sudan 108.5 9.7
## 4 Congo Democratic Republic 107.2 9.7
## 5 Syria 107.1 7.4
## 6 Afghanistan 106.6 9.2
## E2: Economic Inequality P1: State Legitimacy
## 1 9.1 9.6
## 2 7.9 9.8
## 3 8.6 9.8
## 4 8.4 9.3
## 5 6.5 10.0
## 6 8.2 9.4
# Display the column names of the subset (the columns keep their original names; nothing is renamed).
names(frag23_sub)
## [1] "Country" "Total"
## [3] "S1: Demographic Pressures" "E2: Economic Inequality"
## [5] "P1: State Legitimacy"
# Sort the subset by E2 in ascending order: lower scores are better, so the best countries come first.
ordered_by_E2 <- frag23_sub[order(frag23_sub$`E2: Economic Inequality`), ]
# Keep the 10 best (lowest E2). head() returns only rows that exist,
# whereas [1:10, ] would pad the result with NA rows if fewer than 10 were available.
top10_best_E2 <- head(ordered_by_E2, 10)
print(top10_best_E2)
## Country Total S1: Demographic Pressures E2: Economic Inequality
## 179 Norway 14.5 1.4 1.4
## 178 Iceland 15.7 1.5 1.5
## 177 Finland 16.0 1.7 1.6
## 169 Netherlands 21.0 2.5 1.8
## 171 Luxembourg 19.5 2.4 1.8
## 172 Ireland 19.5 2.8 1.8
## 174 Denmark 17.9 2.3 1.8
## 151 Czech Republic 40.2 3.2 1.9
## 155 Slovak Republic 37.8 2.6 2.2
## 160 Belgium 31.4 4.2 2.2
## P1: State Legitimacy
## 179 0.4
## 178 0.4
## 177 0.4
## 169 0.3
## 171 0.3
## 172 0.5
## 174 0.3
## 151 3.9
## 155 3.2
## 160 0.8
# Sort by E2 in descending order (decreasing = TRUE) so the worst (highest) scores come first.
worst_ordered <- frag23_sub[order(frag23_sub$`E2: Economic Inequality`, decreasing = TRUE), ]
# Keep the 10 worst. head() never pads with NA rows the way [1:10, ] does on short data.
top10_worst_E2 <- head(worst_ordered, 10)
print(top10_worst_E2)
## Country Total S1: Demographic Pressures
## 8 Central African Republic 105.7 9.3
## 10 Haiti 102.9 8.8
## 22 Mozambique 94.0 9.6
## 49 Madagascar 81.7 9.6
## 1 Somalia 111.9 10.0
## 48 Zambia 81.8 9.4
## 31 Guinea Bissau 89.9 8.9
## 39 Angola 86.9 9.3
## 9 Chad 104.6 9.5
## 3 South Sudan 108.5 9.7
## E2: Economic Inequality P1: State Legitimacy
## 8 9.6 8.9
## 10 9.2 9.9
## 22 9.2 7.1
## 49 9.2 6.7
## 1 9.1 9.6
## 48 9.1 6.7
## 31 8.9 9.1
## 39 8.8 8.1
## 9 8.7 9.1
## 3 8.6 9.8
Computations
# Per-column statistical summary of 'frag23_sub'.
summary(object = frag23_sub)
## Country Total S1: Demographic Pressures
## Length:179 Min. : 14.50 Min. : 1.100
## Class :character 1st Qu.: 49.00 1st Qu.: 4.100
## Mode :character Median : 68.20 Median : 5.900
## Mean : 65.83 Mean : 5.956
## 3rd Qu.: 82.20 3rd Qu.: 8.050
## Max. :111.90 Max. :10.000
## E2: Economic Inequality P1: State Legitimacy
## Min. :1.400 Min. : 0.300
## 1st Qu.:3.650 1st Qu.: 3.650
## Median :5.200 Median : 6.400
## Mean :5.323 Mean : 5.741
## 3rd Qu.:7.200 3rd Qu.: 8.100
## Max. :9.600 Max. :10.000
# Threshold of the worst quartile of Total: the 75th percentile.
# na.rm = TRUE discards missing values before the quantile is computed.
worst_quartile_value <- quantile(x = frag23_sub$Total, probs = 0.75, na.rm = TRUE)
cat("Worst Quartile Threshold (75th percentile):", worst_quartile_value, "\n")
## Worst Quartile Threshold (75th percentile): 82.2
# Quartile cut points of Total at 25%, 50%, 75% and 100%, ignoring missing values.
quartiles_total <- quantile(x = frag23_sub$Total,
                            probs = c(0.25, 0.5, 0.75, 1),
                            na.rm = TRUE)
print(quartiles_total)
## 25% 50% 75% 100%
## 49.0 68.2 82.2 111.9
# Correlations between S1, P1 and E2: first pick exactly those three numeric columns.
correlation_vars <- frag23_sub[c("S1: Demographic Pressures",
                                 "P1: State Legitimacy",
                                 "E2: Economic Inequality")]
# use = "complete.obs" leaves out any row that has a missing value.
cor_matrix <- cor(x = correlation_vars, use = "complete.obs")
# Print a heading, then the matrix rounded to 3 decimal places.
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(round(cor_matrix, digits = 3))
## S1: Demographic Pressures P1: State Legitimacy
## S1: Demographic Pressures 1.000 0.657
## P1: State Legitimacy 0.657 1.000
## E2: Economic Inequality 0.854 0.666
## E2: Economic Inequality
## S1: Demographic Pressures 0.854
## P1: State Legitimacy 0.666
## E2: Economic Inequality 1.000
# Hypothesis test for the correlation between S1 and E2 (cor.test()).
cor_test_S1_E2 <- cor.test(x = frag23_sub$`S1: Demographic Pressures`,
                           y = frag23_sub$`E2: Economic Inequality`)
cat("\n\nCorrelation Test: S1 (Demographic Pressures) vs E2 (Economic Inequality)\n")
##
##
## Correlation Test: S1 (Demographic Pressures) vs E2 (Economic Inequality)
print(cor_test_S1_E2)
##
## Pearson's product-moment correlation
##
## data: frag23_sub$`S1: Demographic Pressures` and frag23_sub$`E2: Economic Inequality`
## t = 21.814, df = 177, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8082838 0.8890834
## sample estimates:
## cor
## 0.8537425
# Hypothesis test for the correlation between P1 and E2.
cor_test_P1_E2 <- cor.test(x = frag23_sub$`P1: State Legitimacy`,
                           y = frag23_sub$`E2: Economic Inequality`)
cat("\n\nCorrelation Test: P1 (State Legitimacy) vs E2 (Economic Inequality)\n")
##
##
## Correlation Test: P1 (State Legitimacy) vs E2 (Economic Inequality)
print(cor_test_P1_E2)
##
## Pearson's product-moment correlation
##
## data: frag23_sub$`P1: State Legitimacy` and frag23_sub$`E2: Economic Inequality`
## t = 11.884, df = 177, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5757609 0.7404948
## sample estimates:
## cor
## 0.6661759
# Hypothesis test for the correlation between S1 and P1.
cor_test_S1_P1 <- cor.test(x = frag23_sub$`S1: Demographic Pressures`,
                           y = frag23_sub$`P1: State Legitimacy`)
cat("\n\nCorrelation Test: S1 (Demographic Pressures) vs P1 (State Legitimacy)\n")
##
##
## Correlation Test: S1 (Demographic Pressures) vs P1 (State Legitimacy)
print(cor_test_S1_P1)
##
## Pearson's product-moment correlation
##
## data: frag23_sub$`S1: Demographic Pressures` and frag23_sub$`P1: State Legitimacy`
## t = 11.598, df = 177, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5648893 0.7331297
## sample estimates:
## cor
## 0.6571171
# Regress S1 on P1 and E2: a linear regression that predicts S1 from P1 and E2.
# Dependent variable (outcome): S1. Independent variables (predictors): P1 and E2.
# Backticks are required because the column names contain spaces and colons.
regression_model <- lm(formula = `S1: Demographic Pressures` ~ `P1: State Legitimacy` + `E2: Economic Inequality`,
                       data = frag23_sub)
# Display the coefficient table and fit statistics.
summary(regression_model)
##
## Call:
## lm(formula = `S1: Demographic Pressures` ~ `P1: State Legitimacy` +
## `E2: Economic Inequality`, data = frag23_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6991 -0.8670 0.1260 0.7808 2.6197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.85359 0.24234 3.522 0.000545 ***
## `P1: State Legitimacy` 0.12477 0.04024 3.100 0.002250 **
## `E2: Economic Inequality` 0.82389 0.05645 14.594 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.162 on 176 degrees of freedom
## Multiple R-squared: 0.7429, Adjusted R-squared: 0.74
## F-statistic: 254.3 on 2 and 176 DF, p-value: < 2.2e-16
Plotting
``` r
# Histogram of the P1 (State Legitimacy) scores.
hist(x = frag23_sub$`P1: State Legitimacy`,
     col = "steelblue",
     border = "black",
     main = "Distribution of State Legitimacy (P1)",
     xlab = "State Legitimacy Score")
# Scatter plot of S1 against E2, with points colored by whether the country
# falls in the worst quartile of Total.
# The comparison yields TRUE/FALSE for each country.
in_worst_quartile <- frag23_sub$Total >= worst_quartile_value
# One color per country: ifelse() gives "red" where TRUE and "blue" where FALSE.
point_colors <- ifelse(in_worst_quartile, "red", "blue")
# Draw the scatter plot using the per-country colors.
plot(x = frag23_sub$`S1: Demographic Pressures`,
     y = frag23_sub$`E2: Economic Inequality`,
     main = "Demographic Pressures vs Economic Inequality",
     xlab = "S1: Demographic Pressures",
     ylab = "E2: Economic Inequality",
     pch = 19,
     col = point_colors)
# Legend explaining the colors. It is placed at the top left because S1 and E2
# are strongly positively correlated, so the top-right corner is where the
# worst-quartile (red) points cluster and a legend there would hide them.
legend("topleft",
       legend = c("Not in Worst Quartile", "Worst Quartile of Total"),
       col = c("blue", "red"),
       pch = 19)
# Add the regression model's predictions for every row as a new column.
frag23_sub$predicted_S1 <- predict(regression_model, newdata = frag23_sub)
# Plot actual S1 (x axis) against predicted S1 (y axis).
plot(x = frag23_sub$`S1: Demographic Pressures`,
     y = frag23_sub$predicted_S1,
     pch = 19,
     col = "darkgreen",
     main = "Regression Model: Actual vs Predicted S1",
     sub = "S1 predicted by P1 and E2",
     xlab = "Actual S1 (Demographic Pressures)",
     ylab = "Predicted S1")
# Reference line with intercept 0 and slope 1: perfect predictions would all fall on it.
abline(a = 0, b = 1, col = "red", lty = 2)
# Legend: points for the data, dashed line for the reference.
legend("topleft",
       legend = c("Data points", "Perfect prediction line"),
       col = c("darkgreen", "red"),
       pch = c(19, NA),
       lty = c(NA, 2))
# Visual correlation matrix; the corrplot package provides corrplot().
# install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Draw the correlation matrix computed earlier:
#   method = "circle"        circles sized by correlation strength
#   type = "upper"           only the upper triangle is shown
#   addCoef.col = "black"    correlation values are printed in black
corrplot(cor_matrix,
         title = "Correlation Matrix: S1, P1, E2",
         method = "circle",
         type = "upper",
         addCoef.col = "black",
         number.cex = 0.8,
         tl.col = "black",
         tl.srt = 45,
         mar = c(0, 0, 2, 0))
```