library(impute)
Read the data in
getwd()
## [1] "D:/R/DA/assignment1/code"
loans <- read.csv('../data/loansData.csv')
loans$Interest.Rate <- as.numeric(sub("%", "", loans$Interest.Rate))
loans$Debt.To.Income.Ratio <- as.numeric(sub("%", "", loans$Debt.To.Income.Ratio))
loans$Employment.Length[loans$Employment.Length=="n/a"] <- NA
new.employment.year.levels <- levels(factor(loans$Employment.Length))
ordered.year.levels <- new.employment.year.levels[c(2:11,1)]
loans$Employment.Length <- factor(loans$Employment.Length, levels = ordered.year.levels)
loans$FICO.Numeric<- as.numeric(as.character(gsub("-...", "", loans$FICO.Range)))
dim(loans)
## [1] 2500 15
head(loans)
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## 81174 20000 20000 8.90
## 99592 19200 19200 12.12
## 80059 35000 35000 21.98
## 15825 10000 9975 9.99
## 33182 12000 12000 11.71
## 62403 6000 6000 15.31
## Loan.Length Loan.Purpose Debt.To.Income.Ratio State
## 81174 36 months debt_consolidation 14.90 SC
## 99592 36 months debt_consolidation 28.36 TX
## 80059 60 months debt_consolidation 23.81 CA
## 15825 36 months debt_consolidation 14.30 KS
## 33182 36 months credit_card 18.78 NJ
## 62403 36 months other 20.05 CT
## Home.Ownership Monthly.Income FICO.Range Open.CREDIT.Lines
## 81174 MORTGAGE 6541.67 735-739 14
## 99592 MORTGAGE 4583.33 715-719 12
## 80059 MORTGAGE 11500.00 690-694 14
## 15825 MORTGAGE 3833.33 695-699 10
## 33182 RENT 3195.00 695-699 11
## 62403 OWN 4891.67 670-674 17
## Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## 81174 14272 2
## 99592 11140 1
## 80059 21977 1
## 15825 9346 0
## 33182 14469 0
## 62403 10391 2
## Employment.Length FICO.Numeric
## 81174 < 1 year 735
## 99592 2 years 715
## 80059 2 years 690
## 15825 5 years 695
## 33182 9 years 695
## 62403 3 years 670
summary(loans)
## Amount.Requested Amount.Funded.By.Investors Interest.Rate
## Min. : 1000 Min. : -0.01 Min. : 5.42
## 1st Qu.: 6000 1st Qu.: 6000.00 1st Qu.:10.16
## Median :10000 Median :10000.00 Median :13.11
## Mean :12406 Mean :12001.57 Mean :13.07
## 3rd Qu.:17000 3rd Qu.:16000.00 3rd Qu.:15.80
## Max. :35000 Max. :35000.00 Max. :24.89
##
## Loan.Length Loan.Purpose Debt.To.Income.Ratio
## 36 months:1952 debt_consolidation:1307 Min. : 0.000
## 60 months: 548 credit_card : 444 1st Qu.: 9.748
## other : 201 Median :15.320
## home_improvement : 152 Mean :15.378
## major_purchase : 101 3rd Qu.:20.672
## small_business : 87 Max. :34.910
## (Other) : 208
## State Home.Ownership Monthly.Income FICO.Range
## CA : 433 MORTGAGE:1148 Min. : 588.5 670-674: 171
## NY : 255 NONE : 1 1st Qu.: 3500.0 675-679: 166
## TX : 174 OTHER : 5 Median : 5000.0 680-684: 157
## FL : 169 OWN : 200 Mean : 5688.9 695-699: 153
## IL : 101 RENT :1146 3rd Qu.: 6800.0 665-669: 145
## GA : 98 Max. :102750.0 690-694: 140
## (Other):1270 NA's :1 (Other):1568
## Open.CREDIT.Lines Revolving.CREDIT.Balance Inquiries.in.the.Last.6.Months
## Min. : 2.00 Min. : 0 Min. :0.0000
## 1st Qu.: 7.00 1st Qu.: 5586 1st Qu.:0.0000
## Median : 9.00 Median : 10962 Median :0.0000
## Mean :10.08 Mean : 15245 Mean :0.9063
## 3rd Qu.:13.00 3rd Qu.: 18889 3rd Qu.:1.0000
## Max. :38.00 Max. :270800 Max. :9.0000
## NA's :2 NA's :2 NA's :2
## Employment.Length FICO.Numeric
## 10+ years:653 Min. :640.0
## < 1 year :250 1st Qu.:680.0
## 2 years :244 Median :700.0
## 3 years :235 Mean :705.9
## 5 years :202 3rd Qu.:725.0
## (Other) :839 Max. :830.0
## NA's : 77
sapply(loans[1,],class)
## Amount.Requested Amount.Funded.By.Investors
## "integer" "numeric"
## Interest.Rate Loan.Length
## "numeric" "factor"
## Loan.Purpose Debt.To.Income.Ratio
## "factor" "numeric"
## State Home.Ownership
## "factor" "factor"
## Monthly.Income FICO.Range
## "numeric" "factor"
## Open.CREDIT.Lines Revolving.CREDIT.Balance
## "integer" "integer"
## Inquiries.in.the.Last.6.Months Employment.Length
## "integer" "factor"
## FICO.Numeric
## "numeric"
sum(is.na(loans))
## [1] 84
hist(loans$Interest.Rate)
plot(loans$Interest.Rate ~ loans$Amount.Requested,col="blue")
# tiez zaujimave
plot(loans$Interest.Rate ~ loans$Loan.Length,col="blue")
plot(loans$Interest.Rate ~ loans$State,col="blue")
# toto je zaujimava metrika
plot(loans$Interest.Rate ~ loans$FICO.Range,col="blue")
plot(loans$Interest.Rate ~ loans$Inquiries.in.the.Last.6.Months,col="blue")
plot(loans$Interest.Rate ~ loans$Amount.Funded.By.Investors,col="blue")
plot(loans$Interest.Rate ~ loans$Loan.Purpose,col="blue")
plot(loans$Interest.Rate ~ loans$Home.Ownership,col="blue")
plot(loans$Interest.Rate ~ loans$Open.CREDIT.Lines,col="blue")
plot(loans$Interest.Rate ~ loans$Employment.Length,col="blue")
plot(loans$Interest.Rate ~ loans$Debt.To.Income.Ratio,col="blue")
# toto by mohlo byt zaujimave ak by sa odstranili outliery
plot(loans$Interest.Rate ~ loans$Monthly.Income,col="blue")
subset <- loans[loans$Monthly.Income < 30000,]
plot(subset$Interest.Rate ~ subset$Monthly.Income,col="blue")
# tak ani nie
plot(loans$Interest.Rate ~ loans$Revolving.CREDIT.Balance,col="blue")
loansMatrixTmp <- data.matrix(loans)
loansMatrix <- impute.knn(loansMatrixTmp)$data
## Cluster size 2500 broken into 1827 673
## Cluster size 1827 broken into 1211 616
## Done cluster 1211
## Done cluster 616
## Done cluster 1827
## Done cluster 673
#svd <- svd(scale(loansMatrix))
#plot(svd$d,xlab="Column",ylab="Singluar value",pch=19)
#plot(svd$d^2/sum(svd$d^2),xlab="Column",ylab="Percent of variance explained",pch=19)
correlations <- cor(loansMatrix)
correlations["Interest.Rate",][order(-abs(correlations["Interest.Rate",]))]
## Interest.Rate FICO.Range
## 1.000000000 -0.709177443
## FICO.Numeric Loan.Length
## -0.709111285 0.423693811
## Amount.Funded.By.Investors Amount.Requested
## 0.337386549 0.332352117
## Debt.To.Income.Ratio Inquiries.in.the.Last.6.Months
## 0.173206785 0.164245732
## Open.CREDIT.Lines Home.Ownership
## 0.090396417 0.075012639
## Loan.Purpose Revolving.CREDIT.Balance
## -0.062083229 0.061448348
## Monthly.Income State
## 0.012428982 0.008279973
## Employment.Length
## -0.003351337
image(abs(correlations))
#plot(hclust(dist(abs(correlations))))
#subMatrix <- loansMatrix[,c(3,10)]
#distLoans <- dist(subMatrix)
#hClustering <- hclust(distLoans)
#plot(hClustering)
#heatmap(loansMatrix)
#loansMatrixOrdered <- loansMatrix[hClustering$order,]
boxplot(loans$Interest.Rate ~ loans$FICO.Range,col="blue")
plot(loans$FICO.Numeric, loans$Interest.Rate, col=(loans$Loan.Length), pch=19, cex=1.2)
#sem este pridam natrenovany linearny model a pode to do finalnej analyzy
#plot(loans$Interest.Rate, loans$FICO.Numeric, cex=log(loans$Amount.Requested/8000), pch=19)
colfunc <- colorRampPalette(c("orange","blue"))
z.cols <- cut(loans$Amount.Requested, 50, labels=colfunc(50))
plot(loans$FICO.Numeric, loans$Interest.Rate, col=as.character(z.cols), pch=19, cex=1.2)
histogram z corelacnej matice (jeden riadok matice) boxplot inetrest rate / fico (factor) scatterplot interest rate by fico (numeric) and loan length scatterplot interest rate by fico (numeric) and amount requested statistiky o natrenovanom modely