老师您好,在学习中我发现,KNN 补值函数 knnImputation 是按行而非按列进行缺失值填补的,而蛋白组或基因组数据一般以列为样本、以行为蛋白/基因特征。也就是说,此时函数是按行(特征)计算距离、选取 k 个最近邻作为参考来填补缺失值,而不是按列(样本)计算距离。我想请教您:KNN 一般不是计算样本间的距离吗?为什么这里是计算特征间的距离来填补缺失值?这两种方式的填补结果都合理吗?
# Fill in missing values of `data` using the k nearest complete rows.
#
# Rows are treated as cases and columns as attributes: for every row that
# contains an NA, distances are computed between that ROW and the complete
# rows, using only the attributes that are observed in the target row.
# If neighbours should instead be searched among the columns of a table,
# the table has to be transposed by the caller before it is passed in.
#
# Arguments:
#   data     Table whose NAs are to be filled (rows = cases).
#   k        Number of nearest complete rows used for each imputation.
#   scale    If TRUE, continuous columns are centred/scaled before the
#            distances are computed (the returned values are unscaled).
#   meth     "median" calls centralValue() on the neighbours' values with no
#            weights; any other value calls it with weights
#            exp(-distance).
#   distData Optional extra rows; when supplied, neighbours are taken only
#            from these rows instead of from `data` itself.
#
# Returns the first nrow(data) rows of the table with the NAs replaced.
# Raises an error when fewer than k complete rows are available.
function(data, k = 10, scale = TRUE, meth = "weighAvg",
         distData = NULL) {
  if (!is.numeric(k) || length(k) != 1L || is.na(k) || k < 1) {
    stop("'k' must be a single number greater than or equal to 1.")
  }

  n <- nrow(data)
  # Donor rows, when given, are appended below the target rows so that a
  # single row index addresses both sets.
  if (!is.null(distData)) {
    distInit <- n + 1
    data <- rbind(data, distData)
  } else {
    distInit <- 1
  }
  N <- nrow(data)
  nAttrs <- ncol(data)

  contAttrs <- which(vapply(data, dplyr::type_sum, character(1)) %in%
    c("dbl", "int"))
  nomAttrs <- setdiff(seq_len(nAttrs), contAttrs)

  # `dm` is the numeric working copy used only for distances; `data` keeps
  # the original values that are read from and written to.
  dm <- data
  if (scale && length(contAttrs) > 0) {
    # drop = FALSE keeps a single continuous column as a one-column table.
    dm[, contAttrs] <- scale(dm[, contAttrs, drop = FALSE])
  }
  for (i in nomAttrs) dm[[i]] <- as.integer(dm[[i]])
  dm <- as.matrix(dm)

  nas <- which(!complete.cases(dm))
  tgt.nas <- if (is.null(distData)) nas else nas[nas <= n]
  if (length(tgt.nas) == 0) {
    warning("No case has missing values. Stopping as there is nothing to do.")
    # Actually stop here, as the message promises.
    return(data[seq_len(n), ])
  }

  # Candidate neighbour rows. seq.int(length.out = ) yields an empty range
  # when distData has zero rows, where distInit:N would count backwards.
  candidates <- seq.int(distInit, length.out = N - distInit + 1)
  completeIdx <- setdiff(candidates, nas)
  xcomplete <- dm[completeIdx, , drop = FALSE]
  if (nrow(xcomplete) < k) {
    stop("Not sufficient complete cases for computing neighbors.")
  }

  for (i in tgt.nas) {
    tgtAs <- which(is.na(dm[i, ]))
    # Per-attribute difference between every complete row and row i.
    dist <- scale(xcomplete, dm[i, ], FALSE)
    xnom <- setdiff(nomAttrs, tgtAs)
    if (length(xnom) > 0) {
      # Nominal attributes contribute 0 on a match and 1 on any mismatch,
      # whatever the sign of the difference between the integer codes.
      dist[, xnom] <- (dist[, xnom] != 0) * 1
    }
    # Ignore the attributes that are missing in row i; drop = FALSE keeps
    # the matrix shape when only one attribute is left.
    dist <- dist[, -tgtAs, drop = FALSE]
    dist <- sqrt(rowSums(dist^2))
    ks <- order(dist)[seq_len(as.integer(k))]
    donors <- completeIdx[ks]
    for (j in tgtAs) {
      data[i, j] <- if (meth == "median") {
        centralValue(data[donors, j])
      } else {
        centralValue(data[donors, j], exp(-dist[ks]))
      }
    }
  }
  data[seq_len(n), ]
}
<bytecode: 0x00000248c5647580>
<environment: namespace:DMwR2>