A simple script that diagnose breast cancer using k-Nearest Neightbor algorithm. Dataset is donated by University of Wisconsin and can be found at UCI’s Machine Learning Repository http://archive.ics.uci.edu/ml

Step 1: Collecting data

wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
wbcd <- wbcd[-1] # exclude the id column from the dataframe 'cuz it provides no valuable info
table(wbcd$diagnosis) # print out number of benign and malignant diag

  B   M 
357 212 
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant")) # factor and label
prop.table(table(wbcd$diagnosis)) * 100 # percentage

   Benign Malignant 
 62.74165  37.25835 

Step 2: Exploring/Preping data:

Normalizing features:

# this shows that features are very inconsitance scales, therefore we must normailize them
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
  radius_mean       area_mean      smoothness_mean  
 Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
 1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
 Median :13.370   Median : 551.1   Median :0.09587  
 Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
 3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
 Max.   :28.110   Max.   :2501.0   Max.   :0.16340  
# simple function to normalize data passed in; output is in range of [0.0,1.0]
normalize <- function(x){
  return ((x - min(x))/(max(x) - min(x)))
}
# normalize wbcd and store it in wbcd_n
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))
# confirm that fatures had been nomalized
summary(wbcd_n[c("area_mean", "texture_mean", "smoothness_mean")])
   area_mean       texture_mean    smoothness_mean 
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.1174   1st Qu.:0.2185   1st Qu.:0.3046  
 Median :0.1729   Median :0.3088   Median :0.3904  
 Mean   :0.2169   Mean   :0.3240   Mean   :0.3948  
 3rd Qu.:0.2711   3rd Qu.:0.4089   3rd Qu.:0.4755  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  

Split normalized dataset into trainging and test dataset:

wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]

Step 3: Train and predict using class.knn()

#install.packages("class")
library(class)
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) # k = sqrt(469) = 21

Step 4: Evaluating model performance

library(gmodels)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
                 | wbcd_test_pred 
wbcd_test_labels |    Benign | Malignant | Row Total | 
-----------------|-----------|-----------|-----------|
          Benign |        61 |         0 |        61 | 
                 |     1.000 |     0.000 |     0.610 | 
                 |     0.968 |     0.000 |           | 
                 |     0.610 |     0.000 |           | 
-----------------|-----------|-----------|-----------|
       Malignant |         2 |        37 |        39 | 
                 |     0.051 |     0.949 |     0.390 | 
                 |     0.032 |     1.000 |           | 
                 |     0.020 |     0.370 |           | 
-----------------|-----------|-----------|-----------|
    Column Total |        63 |        37 |       100 | 
                 |     0.630 |     0.370 |           | 
-----------------|-----------|-----------|-----------|

 

==> As we can see, our k-NN model correctly predicts all Benign test cases (61/61 = 100%), and correctly predicts 37 Malignant test caeses (37/39 = 94.9%). However, it incorectly predicts 2 (2/39 = 5.1%) Malignant cases as Benign.

Step 5: improving model performance by trying different scaling method and k value

wbcd_z <- as.data.frame(scale(wbcd[-1])) # standardize dataframe using z-scope
summary(wbcd_z[c("area_mean", "texture_mean", "smoothness_mean")])
   area_mean        texture_mean     smoothness_mean   
 Min.   :-1.4532   Min.   :-2.2273   Min.   :-3.10935  
 1st Qu.:-0.6666   1st Qu.:-0.7253   1st Qu.:-0.71034  
 Median :-0.2949   Median :-0.1045   Median :-0.03486  
 Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
 3rd Qu.: 0.3632   3rd Qu.: 0.5837   3rd Qu.: 0.63564  
 Max.   : 5.2459   Max.   : 4.6478   Max.   : 4.76672  
wbcd_train <- wbcd_z[1:469, ]
wbcd_test <- wbcd_z[470:569, ]
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21) # k = sqrt(469) = 21
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
                 | wbcd_test_pred 
wbcd_test_labels |    Benign | Malignant | Row Total | 
-----------------|-----------|-----------|-----------|
          Benign |        61 |         0 |        61 | 
                 |     1.000 |     0.000 |     0.610 | 
                 |     0.924 |     0.000 |           | 
                 |     0.610 |     0.000 |           | 
-----------------|-----------|-----------|-----------|
       Malignant |         5 |        34 |        39 | 
                 |     0.128 |     0.872 |     0.390 | 
                 |     0.076 |     1.000 |           | 
                 |     0.050 |     0.340 |           | 
-----------------|-----------|-----------|-----------|
    Column Total |        66 |        34 |       100 | 
                 |     0.660 |     0.340 |           | 
-----------------|-----------|-----------|-----------|

 

==> Unfortunately, in this case z-scope standardization result in a slightly worse prediction model with 5 (5/39 = 12.8%) Malignant cases predicted as Benign. In the real world, failure in predicting Malignant cancer as Benign can lead patients to believe they are fine and dandy eventhough they need serious treatment!!

LS0tDQp0aXRsZTogIkRpYWdub3NpbmcgQnJlYXN0IENhbmNlciB3aXRoDQprLU5OIEFsZ29yaXRobSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCkEgc2ltcGxlIHNjcmlwdCB0aGF0IGRpYWdub3NlIGJyZWFzdCBjYW5jZXIgdXNpbmcgay1OZWFyZXN0IE5laWdodGJvciBhbGdvcml0aG0uIERhdGFzZXQgaXMgZG9uYXRlZCBieSBVbml2ZXJzaXR5IG9mIFdpc2NvbnNpbiBhbmQgY2FuIGJlIGZvdW5kIGF0IFVDSSdzIE1hY2hpbmUgTGVhcm5pbmcgUmVwb3NpdG9yeSBodHRwOi8vYXJjaGl2ZS5pY3MudWNpLmVkdS9tbA0KPHU+PGg0PlN0ZXAgMTogQ29sbGVjdGluZyBkYXRhPC9oND48L3U+DQpgYGB7cn0NCndiY2QgPC0gcmVhZC5jc3YoIndpc2NfYmNfZGF0YS5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQp3YmNkIDwtIHdiY2RbLTFdICMgZXhjbHVkZSB0aGUgaWQgY29sdW1uIGZyb20gdGhlIGRhdGFmcmFtZSAnY3V6IGl0IHByb3ZpZGVzIG5vIHZhbHVhYmxlIGluZm8NCnRhYmxlKHdiY2QkZGlhZ25vc2lzKSAjIHByaW50IG91dCBudW1iZXIgb2YgYmVuaWduIGFuZCBtYWxpZ25hbnQgZGlhZw0Kd2JjZCRkaWFnbm9zaXMgPC0gZmFjdG9yKHdiY2QkZGlhZ25vc2lzLCBsZXZlbHMgPSBjKCJCIiwgIk0iKSwgbGFiZWxzID0gYygiQmVuaWduIiwgIk1hbGlnbmFudCIpKSAjIGZhY3RvciBhbmQgbGFiZWwNCnByb3AudGFibGUodGFibGUod2JjZCRkaWFnbm9zaXMpKSAqIDEwMCAjIHBlcmNlbnRhZ2UNCmBgYA0KDQoNCjx1PjxoND5TdGVwIDI6IEV4cGxvcmluZy9QcmVwaW5nIGRhdGE6PC9oND48L3U+DQoNCk5vcm1hbGl6aW5nIGZlYXR1cmVzOg0KYGBge3J9DQojIHRoaXMgc2hvd3MgdGhhdCBmZWF0dXJlcyBhcmUgdmVyeSBpbmNvbnNpc3RhbmNlIHNjYWxlcywgdGhlcmVmb3JlIHdlIG11c3Qgbm9ybWFpbGl6ZSB0aGVtDQpzdW1tYXJ5KHdiY2RbYygicmFkaXVzX21lYW4iLCAiYXJlYV9tZWFuIiwgInNtb290aG5lc3NfbWVhbiIpXSkNCmBgYA0KYGBge3J9DQojIHNpbXBsZSBmdW5jdGlvbiB0byBub3JtYWxpemUgZGF0YSBwYXNzZWQgaW47IG91dHB1dCBpcyBpbiByYW5nZSBvZiBbMC4wLDEuMF0NCm5vcm1hbGl6ZSA8LSBmdW5jdGlvbih4KXsNCiAgcmV0dXJuICgoeCAtIG1pbih4KSkvKG1heCh4KSAtIG1pbih4KSkpDQp9DQojIG5vcm1hbGl6ZSB3YmNkIGFuZCBzdG9yZSBpdCBpbiB3YmNkX24NCndiY2RfbiA8LSBhcy5kYXRhLmZyYW1lKGxhcHBseSh3YmNkWzI6MzFdLCBub3JtYWxpemUpKQ0KIyBjb25maXJtIHRoYXQgZmF0dXJlcyBoYWQgYmVlbiBub21hbGl6ZWQNCnN1bW1hcnkod2JjZF9uW2MoImFyZWFfbWVhbiIsICJ0ZXh0dXJlX21lYW4iLCAic21vb3RobmVzc19tZWFuIildKQ0KYGBgDQoNClNwbGl0IG5vcm1hbGl6ZWQgZGF0YXNldCBpbnRvIHRyYWluZ2luZyBhbmQgdGVzdCBkYXRhc2V0Og0KYGBge3J9DQp3YmNkX3RyYWluIDwtIHdiY2RfblsxOjQ2OSwgXQ0Kd2JjZF90ZXN0IDwtIHdiY2Rfbls0NzA6NTY5LCBdDQp3YmNkX3RyYWluX2xhYmVscyA8LSB3YmNkWzE6NDY5LCAxXQ0Kd2JjZF90ZXN0X2xhYmVscyA8LSB3YmNkWzQ3MDo1NjksIDFdDQpgYGANCg0KPHU+PGg0PlN0ZXAgMzogVHJhaW4gYW5kIHByZWRpY3QgdXNpbmcgY2xhc3Mua25uKCk8L2g0PjwvdT4NCg0KYGBge3J9DQojaW5zdGFsbC5wYWNrYWdlcygiY2xhc3MiKQ0KbGlicmFyeShjbGFzcykNCndiY2RfdGVzdF9wcmVkIDwtIGtubih0cmFpbiA9IHdiY2RfdHJhaW4sIHRlc3QgPSB3YmNkX3Rlc3QsIGNsID0gd2JjZF90cmFpbl9sYWJlbHMsIGsgPSAyMSkgIyBrID0gc3FydCg0NjkpID0gMjENCmBgYA0KDQo8dT48aDQ+U3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlPC9oND48L3U+DQoNCmBgYHtyfQ0KbGlicmFyeShnbW9kZWxzKQ0KQ3Jvc3NUYWJsZSh4ID0gd2JjZF90ZXN0X2xhYmVscywgeSA9IHdiY2RfdGVzdF9wcmVkLCBwcm9wLmNoaXNxID0gRkFMU0UpDQpgYGANCj09PiBBcyB3ZSBjYW4gc2VlLCBvdXIgay1OTiBtb2RlbCBjb3JyZWN0bHkgcHJlZGljdHMgYWxsIEJlbmlnbiB0ZXN0IGNhc2VzICg2MS82MSA9IDEwMCUpLCBhbmQgY29ycmVjdGx5IHByZWRpY3RzIDM3IE1hbGlnbmFudCB0ZXN0IGNhZXNlcyAoMzcvMzkgPSA5NC45JSkuIEhvd2V2ZXIsIGl0IGluY29yZWN0bHkgcHJlZGljdHMgMiAoMi8zOSA9IDUuMSUpIE1hbGlnbmFudCBjYXNlcyBhcyBCZW5pZ24uDQoNCjx1PjxoND5TdGVwIDU6IGltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZSBieSB0cnlpbmcgZGlmZmVyZW50IHNjYWxpbmcgbWV0aG9kIGFuZCBrIHZhbHVlPC9oND48L3U+DQo8aDU+KyBTdGFuZGFyZGl6aW5nIHdpdGggei1zY29wZSBtZXRob2QgYW5kIHJlYXBwbHkga19OTiBtb2RlbDo8L2g1Pg0KYGBge3J9DQp3YmNkX3ogPC0gYXMuZGF0YS5mcmFtZShzY2FsZSh3YmNkWy0xXSkpICMgc3RhbmRhcmRpemUgZGF0YWZyYW1lIHVzaW5nIHotc2NvcGUNCnN1bW1hcnkod2JjZF96W2MoImFyZWFfbWVhbiIsICJ0ZXh0dXJlX21lYW4iLCAic21vb3RobmVzc19tZWFuIildKQ0KYGBgDQpgYGB7cn0NCndiY2RfdHJhaW4gPC0gd2JjZF96WzE6NDY5LCBdDQp3YmNkX3Rlc3QgPC0gd2JjZF96WzQ3MDo1NjksIF0NCndiY2RfdHJhaW5fbGFiZWxzIDwtIHdiY2RbMTo0NjksIDFdDQp3YmNkX3Rlc3RfbGFiZWxzIDwtIHdiY2RbNDcwOjU2OSwgMV0NCndiY2RfdGVzdF9wcmVkIDwtIGtubih0cmFpbiA9IHdiY2RfdHJhaW4sIHRlc3QgPSB3YmNkX3Rlc3QsIGNsID0gd2JjZF90cmFpbl9sYWJlbHMsIGsgPSAyMSkgIyBrID0gc3FydCg0NjkpID0gMjENCkNyb3NzVGFibGUoeCA9IHdiY2RfdGVzdF9sYWJlbHMsIHkgPSB3YmNkX3Rlc3RfcHJlZCwgcHJvcC5jaGlzcSA9IEZBTFNFKQ0KYGBgDQo9PT4gVW5mb3J0dW5hdGVseSwgaW4gdGhpcyBjYXNlIHotc2NvcGUgc3RhbmRhcmRpemF0aW9uIHJlc3VsdCBpbiBhIHNsaWdodGx5IHdvcnNlIHByZWRpY3Rpb24gbW9kZWwgd2l0aCA1ICg1LzM5ID0gMTIuOCUpIE1hbGlnbmFudCBjYXNlcyBwcmVkaWN0ZWQgYXMgQmVuaWduLiBJbiB0aGUgcmVhbCB3b3JsZCwgZmFpbHVyZSBpbiBwcmVkaWN0aW5nIE1hbGlnbmFudCBjYW5jZXIgYXMgQmVuaWduIGNhbiBsZWFkIHBhdGllbnRzIHRvIGJlbGlldmUgdGhleSBhcmUgZmluZSBhbmQgZGFuZHkgZXZlbnRob3VnaCB0aGV5IG5lZWQgc2VyaW91cyB0cmVhdG1lbnQhIQ0KDQo=