Loan Interest Rate Associations Analysis

Load some libraries

library(impute)

Processing

Read the data in

getwd()
## [1] "D:/R/DA/assignment1/code"
loans <- read.csv('../data/loansData.csv')

Preprocessing

Transform data types

loans$Interest.Rate <- as.numeric(sub("%", "", loans$Interest.Rate))
loans$Debt.To.Income.Ratio <- as.numeric(sub("%", "", loans$Debt.To.Income.Ratio))
loans$Employment.Length[loans$Employment.Length=="n/a"] <- NA
new.employment.year.levels <- levels(factor(loans$Employment.Length))
ordered.year.levels <- new.employment.year.levels[c(2:11,1)]
loans$Employment.Length <- factor(loans$Employment.Length, levels = ordered.year.levels)



loans$FICO.Numeric<- as.numeric(as.character(gsub("-...", "", loans$FICO.Range)))

Look at the data set

dim(loans)
## [1] 2500   15
head(loans)
##       Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174            20000                      20000          8.90
## 99592            19200                      19200         12.12
## 80059            35000                      35000         21.98
## 15825            10000                       9975          9.99
## 33182            12000                      12000         11.71
## 62403             6000                       6000         15.31
##       Loan.Length       Loan.Purpose Debt.To.Income.Ratio State
## 81174   36 months debt_consolidation                14.90    SC
## 99592   36 months debt_consolidation                28.36    TX
## 80059   60 months debt_consolidation                23.81    CA
## 15825   36 months debt_consolidation                14.30    KS
## 33182   36 months        credit_card                18.78    NJ
## 62403   36 months              other                20.05    CT
##       Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174       MORTGAGE        6541.67    735-739                14
## 99592       MORTGAGE        4583.33    715-719                12
## 80059       MORTGAGE       11500.00    690-694                14
## 15825       MORTGAGE        3833.33    695-699                10
## 33182           RENT        3195.00    695-699                11
## 62403            OWN        4891.67    670-674                17
##       Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174                    14272                              2
## 99592                    11140                              1
## 80059                    21977                              1
## 15825                     9346                              0
## 33182                    14469                              0
## 62403                    10391                              2
##       Employment.Length FICO.Numeric
## 81174          < 1 year          735
## 99592           2 years          715
## 80059           2 years          690
## 15825           5 years          695
## 33182           9 years          695
## 62403           3 years          670
summary(loans)
##  Amount.Requested Amount.Funded.By.Investors Interest.Rate  
##  Min.   : 1000    Min.   :   -0.01           Min.   : 5.42  
##  1st Qu.: 6000    1st Qu.: 6000.00           1st Qu.:10.16  
##  Median :10000    Median :10000.00           Median :13.11  
##  Mean   :12406    Mean   :12001.57           Mean   :13.07  
##  3rd Qu.:17000    3rd Qu.:16000.00           3rd Qu.:15.80  
##  Max.   :35000    Max.   :35000.00           Max.   :24.89  
##                                                             
##     Loan.Length               Loan.Purpose  Debt.To.Income.Ratio
##  36 months:1952   debt_consolidation:1307   Min.   : 0.000      
##  60 months: 548   credit_card       : 444   1st Qu.: 9.748      
##                   other             : 201   Median :15.320      
##                   home_improvement  : 152   Mean   :15.378      
##                   major_purchase    : 101   3rd Qu.:20.672      
##                   small_business    :  87   Max.   :34.910      
##                   (Other)           : 208                       
##      State       Home.Ownership Monthly.Income       FICO.Range  
##  CA     : 433   MORTGAGE:1148   Min.   :   588.5   670-674: 171  
##  NY     : 255   NONE    :   1   1st Qu.:  3500.0   675-679: 166  
##  TX     : 174   OTHER   :   5   Median :  5000.0   680-684: 157  
##  FL     : 169   OWN     : 200   Mean   :  5688.9   695-699: 153  
##  IL     : 101   RENT    :1146   3rd Qu.:  6800.0   665-669: 145  
##  GA     :  98                   Max.   :102750.0   690-694: 140  
##  (Other):1270                   NA's   :1          (Other):1568  
##  Open.CREDIT.Lines Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
##  Min.   : 2.00     Min.   :     0           Min.   :0.0000                
##  1st Qu.: 7.00     1st Qu.:  5586           1st Qu.:0.0000                
##  Median : 9.00     Median : 10962           Median :0.0000                
##  Mean   :10.08     Mean   : 15245           Mean   :0.9063                
##  3rd Qu.:13.00     3rd Qu.: 18889           3rd Qu.:1.0000                
##  Max.   :38.00     Max.   :270800           Max.   :9.0000                
##  NA's   :2         NA's   :2                NA's   :2                     
##  Employment.Length  FICO.Numeric  
##  10+ years:653     Min.   :640.0  
##  < 1 year :250     1st Qu.:680.0  
##  2 years  :244     Median :700.0  
##  3 years  :235     Mean   :705.9  
##  5 years  :202     3rd Qu.:725.0  
##  (Other)  :839     Max.   :830.0  
##  NA's     : 77
sapply(loans[1,],class)
##               Amount.Requested     Amount.Funded.By.Investors 
##                      "integer"                      "numeric" 
##                  Interest.Rate                    Loan.Length 
##                      "numeric"                       "factor" 
##                   Loan.Purpose           Debt.To.Income.Ratio 
##                       "factor"                      "numeric" 
##                          State                 Home.Ownership 
##                       "factor"                       "factor" 
##                 Monthly.Income                     FICO.Range 
##                      "numeric"                       "factor" 
##              Open.CREDIT.Lines       Revolving.CREDIT.Balance 
##                      "integer"                      "integer" 
## Inquiries.in.the.Last.6.Months              Employment.Length 
##                      "integer"                       "factor" 
##                   FICO.Numeric 
##                      "numeric"

Find out about missing values

sum(is.na(loans))
## [1] 84

Some exploratory analysis

hist(loans$Interest.Rate)

plot of chunk unnamed-chunk-5

Plot some attribute pairs

plot(loans$Interest.Rate ~ loans$Amount.Requested,col="blue")

plot of chunk unnamed-chunk-6

# tiez zaujimave
plot(loans$Interest.Rate ~ loans$Loan.Length,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$State,col="blue")

plot of chunk unnamed-chunk-6

# toto je zaujimava metrika
plot(loans$Interest.Rate ~ loans$FICO.Range,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Inquiries.in.the.Last.6.Months,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Amount.Funded.By.Investors,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Loan.Purpose,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Home.Ownership,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Open.CREDIT.Lines,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Employment.Length,col="blue")

plot of chunk unnamed-chunk-6

plot(loans$Interest.Rate ~ loans$Debt.To.Income.Ratio,col="blue")

plot of chunk unnamed-chunk-6

# toto by mohlo byt zaujimave ak by sa odstranili outliery
plot(loans$Interest.Rate ~ loans$Monthly.Income,col="blue")

plot of chunk unnamed-chunk-6

subset <- loans[loans$Monthly.Income < 30000,]
plot(subset$Interest.Rate ~ subset$Monthly.Income,col="blue")

plot of chunk unnamed-chunk-6

# tak ani nie


plot(loans$Interest.Rate ~ loans$Revolving.CREDIT.Balance,col="blue")

plot of chunk unnamed-chunk-6

Impute missing values

loansMatrixTmp <- data.matrix(loans)
loansMatrix <- impute.knn(loansMatrixTmp)$data
## Cluster size 2500 broken into 1827 673 
## Cluster size 1827 broken into 1211 616 
## Done cluster 1211 
## Done cluster 616 
## Done cluster 1827 
## Done cluster 673

Data variance

#svd <- svd(scale(loansMatrix))
#plot(svd$d,xlab="Column",ylab="Singluar value",pch=19)
#plot(svd$d^2/sum(svd$d^2),xlab="Column",ylab="Percent of variance explained",pch=19)

Correlations

correlations <- cor(loansMatrix)
correlations["Interest.Rate",][order(-abs(correlations["Interest.Rate",]))]
##                  Interest.Rate                     FICO.Range 
##                    1.000000000                   -0.709177443 
##                   FICO.Numeric                    Loan.Length 
##                   -0.709111285                    0.423693811 
##     Amount.Funded.By.Investors               Amount.Requested 
##                    0.337386549                    0.332352117 
##           Debt.To.Income.Ratio Inquiries.in.the.Last.6.Months 
##                    0.173206785                    0.164245732 
##              Open.CREDIT.Lines                 Home.Ownership 
##                    0.090396417                    0.075012639 
##                   Loan.Purpose       Revolving.CREDIT.Balance 
##                   -0.062083229                    0.061448348 
##                 Monthly.Income                          State 
##                    0.012428982                    0.008279973 
##              Employment.Length 
##                   -0.003351337
image(abs(correlations))

plot of chunk unnamed-chunk-9

Clustering correlations

#plot(hclust(dist(abs(correlations))))

Cluster by FICO.Range

#subMatrix <- loansMatrix[,c(3,10)]
#distLoans <- dist(subMatrix)
#hClustering <- hclust(distLoans)
#plot(hClustering)

Heatmap

#heatmap(loansMatrix)

Some other clustering

#loansMatrixOrdered <- loansMatrix[hClustering$order,]

Scatter

boxplot(loans$Interest.Rate ~ loans$FICO.Range,col="blue")

plot of chunk unnamed-chunk-14

interest rate by fico and loan length

plot(loans$FICO.Numeric, loans$Interest.Rate, col=(loans$Loan.Length), pch=19, cex=1.2)

plot of chunk unnamed-chunk-15

#sem este pridam natrenovany linearny model a pode to do finalnej analyzy

The same thing using amount requested

#plot(loans$Interest.Rate, loans$FICO.Numeric, cex=log(loans$Amount.Requested/8000), pch=19)
colfunc <- colorRampPalette(c("orange","blue"))
z.cols <- cut(loans$Amount.Requested, 50, labels=colfunc(50))
plot(loans$FICO.Numeric, loans$Interest.Rate, col=as.character(z.cols), pch=19, cex=1.2)

plot of chunk unnamed-chunk-16

do fnale

histogram z corelacnej matice (jeden riadok matice) boxplot inetrest rate / fico (factor) scatterplot interest rate by fico (numeric) and loan length scatterplot interest rate by fico (numeric) and amount requested statistiky o natrenovanom modely