Feature Leakage, and Identifying It with Exploratory Data Analysis and Machine Learning


library(tidyverse)

# Loading some data ----
# One row per (balance bracket, final class) combination. `N` is the row's
# count, `tots` the bracket total, `p.c` the share N / tots, and
# `conf_low` / `conf_hi` the bounds stored alongside that share.
# NOTE(review): the column name `p.c` is kept exactly as found; it is not
# used below, but confirm the intended spelling before relying on it.
loan_data <- structure(
  list(
    finalClass = c(
      "Reject/Cancel", "Success", "Reject/Cancel", "Success", "Success",
      "Reject/Cancel", "Reject/Cancel", "Success", "Reject/Cancel", "Success",
      "Reject/Cancel", "Success", "Reject/Cancel", "Success", "Success",
      "Reject/Cancel", "Success", "Reject/Cancel", "Reject/Cancel", "Success",
      "Reject/Cancel", "Success", "Reject/Cancel", "Success", "Success",
      "Reject/Cancel", "Reject/Cancel", "Success", "Success", "Reject/Cancel",
      "Success", "Reject/Cancel", "Success", "Reject/Cancel", "Reject/Cancel",
      "Success", "Reject/Cancel", "Reject/Cancel", "Success"
    ),
    balance_new_bracket = c(
      "01. <= 10k", "01. <= 10k",
      "02. 10k - 20k", "02. 10k - 20k",
      "03. 20k - 30k", "03. 20k - 30k",
      "04. 30k - 40k", "04. 30k - 40k",
      "05. 40k - 50k", "05. 40k - 50k",
      "06. 50k - 60k", "06. 50k - 60k",
      "07. 60k - 70k", "07. 60k - 70k",
      "08. 70k - 80k", "08. 70k - 80k",
      "09. 80k - 90k", "09. 80k - 90k",
      "10. 90k - 100k", "10. 90k - 100k",
      "11. 100k - 200k", "11. 100k - 200k",
      "12. 200k - 300k", "12. 200k - 300k",
      "13. 300k - 400k", "13. 300k - 400k",
      "14. 400k - 500k", "14. 400k - 500k",
      "15. 500k - 600k", "15. 500k - 600k",
      "16. 600k - 1M", "16. 600k - 1M",
      "17. 1M - 2M", "17. 1M - 2M",
      "18. 2M - 3M",
      "19. 3M - 6M", "19. 3M - 6M",
      "20. > 6M", "20. > 6M"
    ),
    N = c(
      18232L, 5115L, 1697L, 819L, 364L, 761L, 476L, 245L, 308L, 137L,
      210L, 108L, 155L, 89L, 77L, 137L, 52L, 108L, 103L, 39L,
      569L, 260L, 233L, 182L, 1597L, 156L, 109L, 817L, 590L, 116L,
      817L, 100L, 51L, 62L, 9L, 1L, 3L, 4L, 1L
    ),
    p.c = c(
      0.780914036064591, 0.219085963935409, 0.674483306836248,
      0.325516693163752, 0.323555555555556, 0.676444444444444,
      0.660194174757282, 0.339805825242718, 0.692134831460674,
      0.307865168539326, 0.660377358490566, 0.339622641509434,
      0.635245901639344, 0.364754098360656, 0.35981308411215,
      0.64018691588785, 0.325, 0.675,
      0.725352112676056, 0.274647887323944, 0.686369119420989,
      0.313630880579011, 0.56144578313253, 0.43855421686747,
      0.911009697661152, 0.0889903023388477, 0.117710583153348,
      0.882289416846652, 0.835694050991501, 0.164305949008499,
      0.890948745910578, 0.109051254089422, 0.451327433628319,
      0.548672566371681, 1, 0.25,
      0.75, 0.8, 0.2
    ),
    tots = c(
      23347L, 23347L, 2516L, 2516L, 1125L, 1125L, 721L, 721L, 445L, 445L,
      318L, 318L, 244L, 244L, 214L, 214L, 160L, 160L, 142L, 142L,
      829L, 829L, 415L, 415L, 1753L, 1753L, 926L, 926L, 706L, 706L,
      917L, 917L, 113L, 113L, 9L, 4L, 4L, 5L, 5L
    ),
    conf_low = c(
      0.775552136317493, 0.213794081502295, 0.65578046562415,
      0.307220804467065, 0.296264735635882, 0.648227521218143,
      0.624326658051425, 0.305255642604346, 0.646947176024304,
      0.265253980427813, 0.60544358384926, 0.287709357961987,
      0.571443652323727, 0.304282016481803, 0.295522603420615,
      0.571952527712148, 0.25317409400087, 0.596551368545636,
      0.64420157566435, 0.203150823708409, 0.653560936603063,
      0.282154345692913, 0.51220524670192, 0.390195953557052,
      0.896698056863425, 0.076072772673856, 0.0976559949418072,
      0.859767702403072, 0.80626156910148, 0.137713232814479,
      0.868959355941994, 0.0896127959455879, 0.357541357583628,
      0.452272456810347, 0.663732883120057, 0.00630946320970987,
      0.194120449683243, 0.283582063881911, 0.00505076337946806
    ),
    conf_hi = c(
      0.786205918497705, 0.224447863682507, 0.692779195532935,
      0.34421953437585, 0.351772478781857, 0.703735264364118,
      0.694744357395654, 0.375673341948575, 0.734746019572187,
      0.353052823975696, 0.712290642038013, 0.39455641615074,
      0.695717983518197, 0.428556347676273, 0.428047472287852,
      0.704477396579385, 0.403448631454364, 0.74682590599913,
      0.796849176291591, 0.35579842433565, 0.717845654307087,
      0.346439063396937, 0.609804046442948, 0.48779475329808,
      0.923927227326144, 0.103301943136575, 0.140232297596928,
      0.902344005058193, 0.862286767185521, 0.19373843089852,
      0.910387204054412, 0.131040644058006, 0.547727543189653,
      0.642458642416372, 1, 0.805879550316757,
      0.99369053679029, 0.994949236620532, 0.716417936118089
    )
  ),
  row.names = c(NA, -39L),
  class = "data.frame"
)

# Bracket totals, taken from the "Success" rows in row order.
# The "18. 2M - 3M" bracket has no "Success" row, so it is absent here.
observed_n_per_cat <- loan_data %>%
  filter(finalClass == "Success") %>%
  pull(tots)

#' Negative log-likelihood of category counts under a geometric distribution
#'
#' The i-th element of `dat` is treated as the number of observations at
#' support value `i - 1` of the geometric distribution.
#'
#' @param logit_prob Success probability on the logit scale; it is mapped
#'   to (0, 1) with `plogis()`, so the optimiser can search unconstrained.
#' @param dat Numeric vector of counts, one per category, in category order.
#' @return A single number: minus the count-weighted sum of log-densities.
geom_negloglikelihood <- function(logit_prob, dat) {
  support <- seq_along(dat) - 1
  log_density <- dgeom(support, prob = plogis(logit_prob), log = TRUE)
  -sum(log_density * dat)
}

# Maximum-likelihood estimate of the geometric probability. The search runs
# over logit values in [-10, 10]; `$minimum` is the location of the minimum
# (the objective value itself would be `$objective`).
mle_prob <- plogis(
  optimize(
    f = geom_negloglikelihood,
    dat = observed_n_per_cat,
    lower = -10,
    upper = 10
  )$minimum
)

# Expected counts per category under the fitted geometric distribution.
expected_n_per_cat <- sum(observed_n_per_cat) *
  dgeom(seq_along(observed_n_per_cat) - 1, prob = mle_prob)

# Pearson chi-square statistic comparing observed and expected counts.
chisq_statistic <- sum(
  (observed_n_per_cat - expected_n_per_cat)^2 / expected_n_per_cat
)

# Upper-tail p-value of the goodness-of-fit statistic.
# NOTE(review): one parameter (mle_prob) was estimated from these data, so
# `length(observed_n_per_cat) - 2` may be the more appropriate df - confirm.
pchisq(
  chisq_statistic,
  df = length(observed_n_per_cat) - 1,
  lower.tail = FALSE
)

Leave a Reply

Your email address will not be published. Required fields are marked *