Analyzing-human-development-in-India

Data Science Project conducted under the supervision of Prof. Kylie Bemis at Northeastern


# Loading all the required libraries 

library(tidyverse)
library(RODBC)
library(ineq)     # for Lc and Gini
library(ggplot2)
library(scales)
library(showtext) # for fonts
library(stringr)  # for str_wrap
library(grid)
library(ROCR)
library(ggcorrplot)
library(modelr)
library(caret)
library(leaps)
library(nnet)
library(randomForest)
library(dplyr)



load("C:/Users/Akshay/Desktop/ICPSR_36151/DS0003/36151-0003-Data.rda")



women <- da36151.0003

women <- filter(women, EW6 >= 25 & EW6 <= 59)
women <- filter(women, RO6 == "(1) Married 1" | RO6 == "(0) married, spouse absent")
women <- women %>% filter(!is.na(GR46), GR46 != '-')

women <- dplyr::select(women, HHID, PERSONID, IDHH, IDPERSON, EW5, EW6, EW8, EW9, EW10, GR46, ID11, ID13, 
                HHEDUC, HHEDUCM, HHEDUCF, NCHILDM, NCHILDF, SPED6, SPED2, SPED3, SPRO5, INCCROP, INCAGPROP, 
                INCANIMAL, INCAG, INCBUS, INCBUSCALC, INCOTHER, INCOME, INCOMEPC, INCNONAG, INCAGLAB, INCSALARY, 
                INCNREGA,RSUNEARN, GR48, GR46B,GR46A)

women <- mutate(women, education = ifelse(EW8 == "(00) none 0", "illiterate", 
                                          ifelse(EW8 == "(01) 1st class 1" | EW8 == "(02) 2nd class 2" | EW8 == "(03) 3rd class 3" | EW8 == "(04) 4th class 4", "preprimary", 
                                                 ifelse(EW8 == "(05) 5th class 5" | EW8 == "(06) 6th class 6" | EW8 == "(07) 7th class 7" | EW8 == "(08) 8th class 8" | EW8 == "(09) 9th class 9", "primary & postprimary", 
                                                        ifelse(EW8 == "(10) Secondary 10" | EW8 == "(11) 11th Class 11", "secondary", 
                                                               ifelse(EW8 == "(12) High Secondary 12" | EW8 == "(13) 1 year post-secondary 13" | EW8 == "(14) 2 years post-secondary 14", "higher secondary", 
                                                                      ifelse(EW8 == "(15) Bachelors 15" | EW8 == "(16) Above Bachelors 16", "college graduate or higher", "-")))))))
View(head(women, 20))

women$education <- factor(women$education, levels = c("illiterate","preprimary" ,"primary & postprimary" , "secondary", "higher secondary", "college graduate or higher"))
ggplot(data=women %>% filter(!is.na(education), !is.na(GR46B), GR46B == '(1) Yes 1'), aes(x = education)) + geom_bar() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  labs(x = "Education level", y = "Women employed in labor force")


summary <- women %>%
  filter(!is.na(education), !is.na(RSUNEARN)) %>%
  group_by(education) %>%
  summarise(other_household_income = mean(RSUNEARN))

summary



ed_prop <- prop.table(table(women$education))

ggplot(data = summary) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  geom_histogram(aes(x= education, y = other_household_income,group = 1),stat="identity") +
  geom_line(aes(x= education, y = other_household_income,group = 1),stat="identity",color="red",lwd=2) +
  labs(x = "Education level", y = "Other Income")

ggplot(data = women %>% filter(!is.na(education), !is.na(GR48))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  geom_bar(aes(x= education, fill= GR48), position="fill") +
  labs(x = "Education level", y = "Willingness")


ggplot(data = women %>% filter(!is.na(education), !is.na(GR46))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  geom_bar(aes(x= education, fill= GR46), position="dodge") +
  labs(x = "Education level", y = "Ever worked in their life")

ggplot(data = women %>% filter(!is.na(education), !is.na(ID13), !is.na(GR46B), GR46B == '(1) Yes 1')) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  geom_bar(aes(x= education, fill = ID13),show.legend = FALSE) + 
  facet_wrap(~ID13) +
  labs(x = "Education level", y = "Caste")





# ````````````````````````````````````````````````````````````````````````````````````



set.seed(1)
women_par <- resample_partition(women, c(train = 0.8, test = 0.2))



# Converting partitions into tibbles

train_partition <-  as_tibble(women_par$train)
test_partition <- as_tibble(women_par$test)



# Checking for factors with single/ 1 levels ( DROP = Factor has only one level)

(l <- sapply(women, function(x) is.factor(x)))
m <- women[, l]
ifelse(n <- sapply(m, function(x) length(levels(x))) == 1, "DROP", "NODROP")

# No factors in dataset with single levels which mightve caused contrast error while running the model.



# GLM / Logistic Regression

fit <- glm(GR46 ~(education), data = women_par$train, family = binomial(link= "logit")) # Single predictor variable

fit1 <- glm(GR46 ~ EW8+ID11+ID13+RSUNEARN+INCOME, data = women_par$train, family = binomial(link= "logit"))


summary(fit)
summary(fit1)




# ````````````````````````````````````````````````````````````````````````````````````

# FITTING DIFFERENT GLM MODELS

#fit1 <- glm(GR46 ~ EW8+SPED2+INCOME+ID11+ID13, data = women_par$train, family = binomial(link = "logit"))
#
# fit2 <- glm(GR46 ~ (education), data = women_par$train, family = binomial(link="logit"))
# 
# fit3 <- glm(GR46 ~ education + INCOME, data = women, family = binomial())
# 
# fit4 <- glm(GR46 ~ education + RSUNEARN + EW6 + SPED6 , data = women, family = binomial())
# 
# fit5 <- glm(GR46 ~ education + RSUNEARN + EW6 + SPED6 + ID13, data = women, family = binomial())
#
# fit6 <- glm(GR46 ~ EW8 + RSUNEARN + EW6 + SPED6 + ID13 + ID11, data = women_par$train, family = binomial(link = "logit"))

# ````````````````````````````````````````````````````````````````````````````````````




# Stepwise generalized logistic regression linear regression to check if if adding diff variables improves the model or not.

lm.one <- glm(GR46 ~ education,data=train_partition,family = binomial(link= "logit"),na.action = na.roughfix)

lm.all <- glm(GR46 ~ EW5+ EW6+ education+ EW9+ EW10+ID11+ ID13+HHEDUC+ HHEDUCM+ HHEDUCF+SPED6+ SPED2+ SPED3+ INCOME+RSUNEARN,data=train_partition,family = binomial(link= "logit"),na.action = na.roughfix)

step(lm.one,scope=list(upper=lm.all, lower= lm.one), direction = "forward",trace = 1)




# Trying multinomial regression 

fit1m <- multinom(GR46 ~ EW8+EW9+EW10+SPED2+INCOME+ID11+ID13, data = train_partition,hess=TRUE)




# Caluculating prediction values ( to be run everytime after running the model)
 
pred <- as_tibble(predict.glm(fit1, test_partition,na.action = na.pass))


# Converting predictions to match the levels of the response variable

pred_fc <- mutate(pred, value = ifelse(pred > 0.5,"(1) Yes 1","(0) No 0"))
p_class <- factor(pred_fc$value, levels = levels(women$GR46))



# Displaying Confusion Matrix 

Cmatrix <- confusionMatrix( p_class, test_partition$GR46, dnn = c("Prediction", "Reference"))
Cmatrix
cat("The F Score is",Cmatrix$byClass["F1"])


cdplot(GR46 ~ education, data=women)

# Running randomForest on "women" data.

women.rf1=randomForest(GR46 ~EW6+ education+ SPED6+ SPRO5+ INCOME+ RSUNEARN, data = train_partition, na.action = na.roughfix, mtry=6)
women.rf1
plot(women.rf1)

  

# Calculating predictions for randomForest

cl <- as_tibble(predict(women.rf1, test_partition))
p_cl <- factor(cl$value, levels = levels(women$GR46))


# Displaying COnfusion Matrix

Cmatrix1 <- confusionMatrix( p_cl,test_partition$GR46, dnn = c("Prediction", "Reference"))
Cmatrix1
cat("The F Score is",Cmatrix1$byClass["F1"])

importance(women.rf1)


# ````````````````````````````````````````````````````````````````````````````````````

# Plotting Precision , Recall and  TPR/FPR graph (ROC) 

OOB.votes <- predict (women.rf1,test_partition,type="prob")
OOB.pred <- OOB.votes[,2]

pred.obj <- prediction (OOB.pred,test_partition$GR46)

RP.perf <- performance(pred.obj, "rec","prec")
plot (RP.perf)

ROC.perf <- performance(pred.obj, "tpr","fpr")
plot (ROC.perf)

library(tidyverse)
library(dplyr)

# Loading DS1 Dataframe 

load("C:/Users/Akshay/Desktop/ICPSR_36151/DS0001/36151-0001-Data.rda")
load("C:/Users/Akshay/Desktop/ICPSR_36151/DS0010/36151-0010-Data.rda")

ds1 <- as_tibble(da36151.0001)

ds1 <- dplyr::select(ds1, HHID, NF5, NF25, NF45, INCCROP, INCAGPROP, INCANIMAL, INCAG, INCBUS, 
                     INCOTHER, INCEARN, INCBENEFITS, INCOME, INCREMIT, INCOMEPC, WS3NM, WS4, WS5, 
                     WS7MONTHS, WSEARN, WS12, WSEARNAGLAB, WSEARNNONAG, WSEARNSALARY, WSEARNNREGA, 
                     RSUNEARN, INCNONAG, INCAGLAB, INCSALARY, INCNREGA, INCNONNREGA, HHEDUC, HHEDUCM, 
                     HHEDUCF, MG10)

ds10 <- as_tibble(da36151.0010)

# Loading IHDS 2005 data from tsv files (no .rda available)

ihds2005 <- as_tibble(read.delim('C:/DMP/22626-0001-Data.tsv',sep = "\t"))



# preliminary LC and GIni Plots 

x <- ineq(ds1$INCOME,type= "Gini")

y <- Lc(ds1$INCOME)

plot(y)




# Plotting Gini Curves for 2004-05 and 2011-12 IHDS Data.


font.add.google("Poppins", "myfont")
showtext.auto()


Income05 <- ihds2005$income
Income <- ds1$INCOME

lorenz05 <- Lc(Income05)
lorenz_df1 <- data.frame(prop_pop = lorenz05$p, income = lorenz05$L) %>%
   mutate(prop_equality = prop_pop)

ineq(Income05,type="Gini")
ineq(Income,type="Gini")

lorenz11 <- Lc(Income)
lorenz_df2 <- data.frame(prop_pop = lorenz11$p, income = lorenz11$L) %>%
   mutate(prop_equality = prop_pop)

p1 <- ggplot(lorenz_df1, aes(x = prop_pop, y = income)) +
   geom_ribbon(data=lorenz_df2,aes(ymax = prop_equality, ymin = income), fill = "yellow",alpha=0.5)+
   geom_line() +
   geom_abline(slope = 1, xintercept = 0, type="l", lty=2,lwd=2) +
   scale_x_continuous("Cumulative proportion of population", label = percent) +
   scale_y_continuous("Cumulative proportion of income", label = percent) +
   theme_minimal(base_family = "myfont") +
   coord_equal() +
   annotate("text",0.53, 0.32, label = "Inequality Gap", family = "myfont") +
   annotate("text", 0.5 , 0.6, label = "Complete equality line", angle = 45, family = "myfont") + 
   ggtitle (
      str_wrap("Cumulative distribution of income from all sources", 200))

print(p1)

grid.text("Source: IHDS 2011-12", 0.8, 0.23, 
       gp = gpar(fontfamily = "myfont", fontsize = 15))

ggsave("IncomeEquality.jpeg")



font.add.google("Poppins", "myfont")
showtext.auto()


p2 <- ggplot(lorenz_df1, aes(x = prop_pop, y = income)) +
   geom_ribbon(aes(ymax = prop_equality, ymin = income), fill = "red",alpha=0.5) +
   geom_line() +
   geom_abline(slope = 1, xintercept = 0, type="l", lty=2,lwd=2) +
   scale_x_continuous("Cumulative proportion of population", label = percent) +
   scale_y_continuous("Cumulative proportion of income", label = percent) +
   theme_minimal(base_family = "myfont") +
   coord_equal() +
   annotate("text",0.53, 0.32, label = "Inequality Gap", family = "myfont") +
   annotate("text", 0.5 , 0.6, label = "Complete equality line", angle = 45, family = "myfont") + 
   ggtitle (
      str_wrap("Cumulative distribution of income from all sources", 200))

print(p2)

grid.text("Source: IHDS 2004-05", 0.8, 0.23, 
       gp = gpar(fontfamily = "myfont", fontsize = 10))



# loading Agrarian Data 

load("C:/Users/Akshay/Desktop/ICPSR_36151/DS0002/36151-0002-Data.rda")


household_data <- da36151.0002
View(head(household_data,20))

# Sorting and Selecting relevant variables for the model

agrarian <- dplyr::select(household_data, HHID, ID14, FM2, FM3,FM4A,FM5A,FM6A,FM4B,FM5B, FM6B,FM4C,FM26A, FM26B,
                   FM5C, FM6C, FM40E, FM11A, FM11B, FM11C, FM27B, FM29, FM30, FM31, FM32, FM33, FM34, FM29RS, FM30RS, FM40E)

# FM29RS - "Rupees spent last year on fertilizer and manure"
# FM30RS - "Rs last year on herbicides and pesticides"
# FM27A -  "Hired farm labour days"
# ID14 -   "Main income source"
# FM2 - "Local area unit name"
# FM3 - "Local units/acre"
# FM4A - "Owned kharif"
# FM5A - "Rented in kharif"
# FM6A - "Rented out kharif"
# FM4B - "Owned rabi"
# FM5B - "Rented in rabi"
# FM6B - "Rented out rabi"
# FM4C - "Owned summer"
# FM5C - "Rented in summer"
# FM6C - "Rented out summer"
# FM40E - "Tractors/Tillers"
# FM11A - "Cultivated kharif"
# FM11B - "Cultivated rabi"
# FM11C - "Cultivated summer"
# FM26A - "Crop residue total value (rupees)"
# FM26B - "Crop residue sold (rupees)"
# FM27B - "Hired farm labour Rs"
# FM29: - "Fertilizers Rs"
# FM30: - "Pesticides Rs"
# FM31: - "Irrigation water Rs"
# FM32: - "Hired Equipment/Animals Rs"
# FM33: - "Agriculture loan repayment Rs"
# FM34: - "Farm miscellaneous Rs"


agrarian <- agrarian %>%
  mutate(cultivated_land = (FM11A + FM11B + FM11C) /FM3)

#agrarian <- na.omit(agrarian)

agrarian <-  filter(agrarian, cultivated_land <= 30)

summary(agrarian$cultivated_land)

ggplot(agrarian) + geom_histogram(aes(cultivated_land), binwidth = 5) + coord_cartesian(xlim = c(0, 75), ylim = c(0,5000))


#Partitioning data for training and Testing

agrarian1 <-  resample_partition(agrarian, c(train = 0.8,test = 0.1, valid = 0.1))

agrarian_train <- as_tibble(agrarian1$train)
agrarian_valid <- as_tibble(agrarian1$valid)
agrarian_test <- as_tibble(agrarian1$test)


train=sample(1:nrow(agrarian_train),nrow(agrarian_train))


#test = sample(1:nrow(agrarian_test),nrow(agrarian_test)) 
#valid = sample(1:nrow(agrarian_valid),nrow(agrarian_valid)) 
#agri.rf1=randomForest(cultivated_land ~ FM27B+FM29+FM30+FM31+FM32+FM33+FM34, data = agrarian, subset = train,na.action = na.roughfix)



# Running randomForest 

agri.rf1=randomForest(cultivated_land ~ FM27B+FM29RS+FM30RS+FM31+FM32+FM34+FM26A, data = agrarian_train, 
                      na.action = na.roughfix, ntree = 200)

ggplot(agrarian) + geom_line(mapping = aes( x = log(FM32) , y = cultivated_land))


agri.rf1

plot(agri.rf1)
importance(agri.rf1) 


# Plotting Actual vs Predicted cultvated land values to visually test the model fit 


cl <- as_tibble(predict(agri.rf1, na.omit(agrarian_test)))
test_omit <- na.omit(agrarian_test)
x1 <- seq(1:nrow(test_omit))
df <- mutate(cl,arpita = row_number())
df1 <- mutate(test_omit, no_rows = row_number())
ggplot() + geom_smooth(data = df, mapping = aes(x = arpita, y = value,color='black')) + 
geom_smooth(data = df1, mapping = aes(x= no_rows, y = df1$cultivated_land,color='red')) +
  scale_colour_manual(name = 'cultivated land', 
                      values =c('black'='black','red'='red'), labels = c('predicted','actual')) + 
  labs(x = "observation number", y = "cultivated land")



# Plotting Residuals against predictor variables

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM27B, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,20000)) + 
  labs(x = "Hired farm labour (Rs)")

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM29RS, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,20000)) + 
  labs(x = "Rupees spent last year on fertilizer and manure")

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM30RS, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,20000)) + 
  labs(x = "Rs last year on herbicides and pesticides")

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM31, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,10000)) + 
  labs(x = "Rupees spent on irrigation")

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM32, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,20000)) + 
  labs(x = "Hired Equipment/Animals Rs")

agrarian_test %>% add_residuals(agri.rf1) %>%
  ggplot(aes(x=FM34, y=resid)) + geom_point() + scale_x_continuous(limits = c(0,10000)) + 
  labs(x = "Farm miscellaneous Rs")



# total costs evaluations

df_test1 <- mutate(agrarian, money_invested = FM29 + FM32 + 
                     FM31 + FM30 + FM27B + FM34, unsold_residue = FM26A - FM26B)%>%  
  filter(ID14=="(01) Cultivation 1") %>%
  summarise(total_money = sum(money_invested, na.rm = TRUE), 
            Crop_residue = sum(FM26A, na.rm = TRUE), 
            Residue_sold = sum(FM26B, na.rm = TRUE),
            unsold_residue = sum(unsold_residue, na.rm = TRUE))