# Distributed k-means

First of all, make sure you start RHadoop properly:
>sudo R

# Set JAVA_HOME before loading rhdfs (presumably rhdfs needs a JVM -- confirm).
# The path is an example; adjust it to wherever your JDK is installed.
>Sys.setenv(JAVA_HOME='/usr/lib/jvm/java-6-openjdk-amd64')
# Load the rhdfs package.
>library(rhdfs)
# Initialise rhdfs -- presumably sets up the HDFS connection; see the rhdfs docs.
>hdfs.init()
# List "." as a quick sanity check of the setup (presumably an HDFS listing).
>hdfs.ls(".")

The rest of the code comes from the RHadoop documentation. We will discuss it in class. This might be useful too.

#' Distributed k-means clustering built on rmr's mapreduce().
#'
#' Every iteration runs one MapReduce job. The map step labels each point with
#' the index of its nearest centre (random labels on the first pass, when no
#' centres exist yet); the reduce step averages the points sharing a label.
#'
#' @param P Input handed straight to mapreduce(). The map step treats each
#'   chunk it receives as a numeric matrix with one point per row.
#' @param num.clusters Number of clusters to look for.
#' @param num.iter Number of MapReduce iterations to run.
#' @return A matrix of cluster centres, one centre per row (NULL when
#'   num.iter is 0, because no job is run).
kmeans.mr =
  function(P, num.clusters, num.iter) {
    # Squared Euclidean distance from every point (row of P) to every centre
    # (row of C). The explicit matrix() keeps the nrow(P) x nrow(C) shape even
    # when a chunk holds a single point or only one centre is left.
    dist.fun =
      function(C, P) {
        matrix(
          apply(C, 1, function(x) colSums((t(P) - x)^2)),
          nrow = nrow(P),
          ncol = nrow(C))}

    # Map step: key every point by its cluster label. C is read from the
    # enclosing kmeans.mr() call, so it is NULL only during the first iteration.
    kmeans.map.1 =
      function(., P) {
        nearest =
          if (is.null(C)) {
            sample.int(num.clusters, nrow(P), replace = TRUE)
          } else {
            # max.col() picks the largest entry per row, hence the negation.
            max.col(-dist.fun(C, P))
          }
        keyval(nearest, P)}

    # Reduce step: the new centre is the column-wise mean of the points that
    # share a label, returned as a 1 x d matrix.
    kmeans.reduce.1 =
      function(x, P) {
        t(as.matrix(colMeans(P)))}

    C = NULL
    for (i in seq_len(num.iter)) {
      C =
        values(
          from.dfs(
            mapreduce(
              P,
              map = kmeans.map.1,
              reduce = kmeans.reduce.1)))
      # Clusters that attracted no points vanish from the reduce output. Top
      # the centres back up to num.clusters with random linear combinations of
      # the surviving ones, keeping the survivors untouched.
      n.missing = num.clusters - nrow(C)
      if (n.missing > 0) {
        C =
          rbind(
            C,
            matrix(
              rnorm(n.missing * nrow(C)),
              ncol = nrow(C)) %*% C)}}
    C}

# Synthetic data set: 100 two-dimensional points scattered around 5 random
# centres.
input =
  local({
    # Five centres (one per row), drawn with a standard deviation of 10.
    centres = matrix(rnorm(10, sd = 10), ncol = 2)
    # Stack 20 copies of the centre matrix on top of each other ...
    stacked = centres[rep(seq_len(nrow(centres)), times = 20), , drop = FALSE]
    # ... and jitter every point with unit-variance noise.
    stacked + matrix(rnorm(200), ncol = 2)
  })

# Cluster the data: pass it through to.dfs() and ask for 12 clusters over
# 5 iterations.
kmeans.mr(to.dfs(input), num.clusters = 12, num.iter = 5)

Page title Most recent update Last edited by
ICML 2013 Review August 2, 2013 4:15 PM nikolaos v.
Lesson 8 April 10, 2013 1:57 PM nikolaos v.
Lesson 7 April 3, 2013 11:44 AM nikolaos v.
Other clustering December 6, 2012 3:33 PM nikolaos v.
Distributed k-means December 5, 2012 11:23 PM nikolaos v.
Introduction to k-means December 5, 2012 11:09 PM nikolaos v.
Lesson 3 December 6, 2012 4:21 PM nikolaos v.
Decision Tree November 16, 2012 3:21 PM nikolaos v.
Regression Tree November 16, 2012 3:10 PM nikolaos v.
Lesson 2 Run a big logistic regression November 16, 2012 2:33 PM nikolaos v.
Lesson 2 Logistic Regression November 16, 2012 2:23 PM nikolaos v.

### Atlanta, GA

Founded Oct 23, 2012

#### Organizer:

• ##### Ismion Inc

The instructor for teaching the courses

• ##### LogicBlox Inc

LogicBlox offers space, equipment, and payment for the instructors

• ##### Predictix

Paying for cloud time and for TAs

• ##### Kabbage

Space and great pizza