00001
00006
00007
00008
00009
00010
00011
00012
00013 #include "kmeans.h"
00014 #include "generic.h"
00015 #include "mathop.h"
00016 #include <string.h>
00017
00173
00174 #ifndef VL_KMEANS_INSTANTIATING
00175
00176
00186 VL_EXPORT void
00187 vl_kmeans_reset (VlKMeans * self)
00188 {
00189 self->numCenters = 0 ;
00190 self->dimension = 0 ;
00191
00192 if (self->centers) vl_free(self->centers) ;
00193 if (self->centerDistances) vl_free(self->centerDistances) ;
00194 self->centers = NULL ;
00195 self->centerDistances = NULL ;
00196 }
00197
00205 VL_EXPORT VlKMeans *
00206 vl_kmeans_new (vl_type dataType,
00207 VlVectorComparisonType distance)
00208 {
00209 VlKMeans * self = vl_malloc(sizeof(VlKMeans)) ;
00210
00211 self->algorithm = VlKMeansLLoyd ;
00212 self->distance = distance ;
00213 self->dataType = dataType ;
00214
00215 self->verbosity = 0 ;
00216 self->maxNumIterations = 100 ;
00217 self->numRepetitions = 1 ;
00218
00219 self->centers = NULL ;
00220 self->centerDistances = NULL ;
00221
00222 vl_kmeans_reset (self) ;
00223
00224 return self ;
00225 }
00226
00233 VL_EXPORT VlKMeans *
00234 vl_kmeans_new_copy (VlKMeans const * kmeans)
00235 {
00236 VlKMeans * self = vl_malloc(sizeof(VlKMeans)) ;
00237
00238 self->algorithm = kmeans->algorithm ;
00239 self->distance = kmeans->distance ;
00240 self->dataType = kmeans->dataType ;
00241
00242 self->verbosity = kmeans->verbosity ;
00243 self->maxNumIterations = kmeans->maxNumIterations ;
00244 self->numRepetitions = kmeans->numRepetitions ;
00245
00246 self->dimension = kmeans->dimension ;
00247 self->numCenters = kmeans->numCenters ;
00248 self->centers = NULL ;
00249 self->centerDistances = NULL ;
00250
00251 if (kmeans->centers) {
00252 vl_size dataSize = vl_get_type_size(self->dataType) * self->dimension * self->numCenters ;
00253 self->centers = vl_malloc(dataSize) ;
00254 memcpy (self->centers, kmeans->centers, dataSize) ;
00255 }
00256
00257 if (kmeans->centerDistances) {
00258 vl_size dataSize = vl_get_type_size(self->dataType) * self->numCenters * self->numCenters ;
00259 self->centerDistances = vl_malloc(dataSize) ;
00260 memcpy (self->centerDistances, kmeans->centerDistances, dataSize) ;
00261 }
00262
00263 return self ;
00264 }
00265
00274 VL_EXPORT void
00275 vl_kmeans_delete (VlKMeans * self)
00276 {
00277 vl_kmeans_reset (self) ;
00278 vl_free (self) ;
00279 }
00280
00281
00282 typedef struct _VlKMeansSortWrapper
00283 {
00284 vl_uint32 * permutation ;
00285 void const * data ;
00286 vl_size stride ;
00287 } VlKMeansSortWrapper ;
00288
00289
00290
00291
00292 #define VL_SHUFFLE_type vl_uindex
00293 #define VL_SHUFFLE_prefix _vl_kmeans
00294 #include "shuffle-def.h"
00295
00296
00297 #endif
00298
00299
00300 #ifdef VL_KMEANS_INSTANTIATING
00301
00302
00303
00304
00305
00306 static void
00307 VL_XCAT(_vl_kmeans_set_centers_, SFX)
00308 (VlKMeans * self,
00309 TYPE const * centers,
00310 vl_size dimension,
00311 vl_size numCenters)
00312 {
00313 self->dimension = dimension ;
00314 self->numCenters = numCenters ;
00315 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00316 memcpy ((TYPE*)self->centers, centers,
00317 sizeof(TYPE) * dimension * numCenters) ;
00318 }
00319
00320
00321
00322
00323
00324 static void
00325 VL_XCAT(_vl_kmeans_seed_centers_with_rand_data_, SFX)
00326 (VlKMeans * self,
00327 TYPE const * data,
00328 vl_size dimension,
00329 vl_size numData,
00330 vl_size numCenters)
00331 {
00332 vl_uindex i, j, k ;
00333 VlRand * rand = vl_get_rand () ;
00334
00335 self->dimension = dimension ;
00336 self->numCenters = numCenters ;
00337 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00338
00339 {
00340 vl_uindex * perm = vl_malloc (sizeof(vl_uindex) * numData) ;
00341 #if (FLT == VL_TYPE_FLOAT)
00342 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00343 #else
00344 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00345 #endif
00346 TYPE * distances = vl_malloc (sizeof(TYPE) * numCenters) ;
00347
00348
00349 for (i = 0 ; i < numData ; ++i) perm[i] = i ;
00350 _vl_kmeans_shuffle (perm, numData, rand) ;
00351
00352 for (k = 0, i = 0 ; k < numCenters ; ++ i) {
00353
00354
00355
00356
00357 if (numCenters - k < numData - i) {
00358 vl_bool duplicateDetected = VL_FALSE ;
00359 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distances,
00360 dimension,
00361 data + dimension * perm[i], 1,
00362 (TYPE*)self->centers, k,
00363 distFn) ;
00364 for (j = 0 ; j < k ; ++j) { duplicateDetected |= (distances[j] == 0) ; }
00365 if (duplicateDetected) continue ;
00366 }
00367
00368
00369 memcpy ((TYPE*)self->centers + dimension * k,
00370 data + dimension * perm[i],
00371 sizeof(TYPE) * dimension) ;
00372 k ++ ;
00373 }
00374 vl_free(distances) ;
00375 vl_free(perm) ;
00376 }
00377 }
00378
00379
00380
00381
00382
00383 static void
00384 VL_XCAT(_vl_kmeans_seed_centers_plus_plus_, SFX)
00385 (VlKMeans * self,
00386 TYPE const * data,
00387 vl_size dimension,
00388 vl_size numData,
00389 vl_size numCenters)
00390 {
00391 vl_uindex x, c ;
00392 VlRand * rand = vl_get_rand () ;
00393 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00394 TYPE * minDistances = vl_malloc (sizeof(TYPE) * numData) ;
00395 #if (FLT == VL_TYPE_FLOAT)
00396 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00397 #else
00398 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00399 #endif
00400
00401 self->dimension = dimension ;
00402 self->numCenters = numCenters ;
00403 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00404
00405 for (x = 0 ; x < numData ; ++x) {
00406 minDistances[x] = (TYPE) VL_INFINITY_D ;
00407 }
00408
00409
00410 x = vl_rand_uindex (rand, numData) ;
00411 c = 0 ;
00412 while (1) {
00413 TYPE energy = 0 ;
00414 TYPE acc = 0 ;
00415 TYPE thresh = (TYPE) vl_rand_real1 (rand) ;
00416
00417 memcpy ((TYPE*)self->centers + c * dimension,
00418 data + x * dimension,
00419 sizeof(TYPE) * dimension) ;
00420
00421 c ++ ;
00422 if (c == numCenters) break ;
00423
00424 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)
00425 (distances,
00426 dimension,
00427 (TYPE*)self->centers + (c - 1) * dimension, 1,
00428 data, numData,
00429 distFn) ;
00430
00431 for (x = 0 ; x < numData ; ++x) {
00432 minDistances[x] = VL_MIN(minDistances[x], distances[x]) ;
00433 energy += minDistances[x] ;
00434 }
00435
00436 for (x = 0 ; x < numData - 1 ; ++x) {
00437 acc += minDistances[x] ;
00438 if (acc >= thresh * energy) break ;
00439 }
00440 }
00441
00442 vl_free(distances) ;
00443 vl_free(minDistances) ;
00444 }
00445
00446
00447
00448
00449
00450 static void
00451 VL_XCAT(_vl_kmeans_quantize_, SFX)
00452 (VlKMeans * self,
00453 vl_uint32 * assignments,
00454 TYPE * distances,
00455 TYPE const * data,
00456 vl_size numData)
00457 {
00458 vl_uindex i ;
00459 #if (FLT == VL_TYPE_FLOAT)
00460 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00461 #else
00462 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00463 #endif
00464 TYPE * distanceToCenters = vl_malloc (sizeof(TYPE) * self->numCenters) ;
00465
00466 for (i = 0 ; i < numData ; ++i) {
00467 vl_size k ;
00468 TYPE bestDistance = (TYPE) VL_INFINITY_D ;
00469 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distanceToCenters,
00470 self->dimension,
00471 data + self->dimension * i, 1,
00472 (TYPE*)self->centers, self->numCenters,
00473 distFn) ;
00474 for (k = 0 ; k < self->numCenters ; ++k) {
00475 if (distanceToCenters[k] < bestDistance) {
00476 bestDistance = distanceToCenters[k] ;
00477 assignments[i] = k ;
00478 }
00479 }
00480
00481 if (distances) distances[i] = bestDistance ;
00482 }
00483 vl_free(distanceToCenters) ;
00484 }
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494 VL_INLINE TYPE
00495 VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00496 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00497 {
00498 return
00499 ((TYPE*)array->data) [array->permutation[indexA] * array->stride]
00500 -
00501 ((TYPE*)array->data) [array->permutation[indexB] * array->stride] ;
00502 }
00503
00504 VL_INLINE void
00505 VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00506 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00507 {
00508 vl_uint32 tmp = array->permutation[indexA] ;
00509 array->permutation[indexA] = array->permutation[indexB] ;
00510 array->permutation[indexB] = tmp ;
00511 }
00512
00513 #define VL_QSORT_prefix VL_XCAT3(_vl_kmeans_, SFX, _qsort)
00514 #define VL_QSORT_array VlKMeansSortWrapper*
00515 #define VL_QSORT_cmp VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00516 #define VL_QSORT_swap VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00517 #include "qsort-def.h"
00518
00519 static void
00520 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)
00521 (VlKMeans * self, vl_uint32 * permutations, TYPE const * data, vl_size numData)
00522 {
00523 vl_uindex d, x ;
00524
00525 for (d = 0 ; d < self->dimension ; ++d) {
00526 VlKMeansSortWrapper array ;
00527 array.permutation = permutations + d * numData ;
00528 array.data = data + d ;
00529 array.stride = self->dimension ;
00530 for (x = 0 ; x < numData ; ++x) { array.permutation[x] = x ; }
00531 VL_XCAT3(_vl_kmeans_, SFX, _qsort_sort)(&array, numData) ;
00532 }
00533 }
00534
00535
00536
00537
00538
00539 static double
00540 VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)
00541 (VlKMeans * self,
00542 TYPE const * data,
00543 vl_size numData)
00544 {
00545 vl_size c, d, x, iteration ;
00546 vl_bool allDone ;
00547 double previousEnergy = VL_INFINITY_D ;
00548 double energy ;
00549 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00550 vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
00551 vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
00552 vl_uint32 * permutations = NULL ;
00553 vl_size * numSeenSoFar = NULL ;
00554
00555 if (self->distance == VlDistanceL1) {
00556 permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
00557 numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
00558 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
00559 }
00560
00561 for (energy = VL_INFINITY_D,
00562 iteration = 0,
00563 allDone = VL_FALSE ;
00564 1 ;
00565 ++ iteration) {
00566
00567
00568 VL_XCAT(_vl_kmeans_quantize_, SFX)(self, assignments, distances, data, numData) ;
00569
00570
00571 energy = 0 ;
00572 for (x = 0 ; x < numData ; ++x) energy += distances[x] ;
00573 if (self->verbosity) {
00574 VL_PRINTF("kmeans: Lloyd iter %d: energy = %g\n", iteration,
00575 energy) ;
00576 }
00577
00578
00579 if (iteration >= self->maxNumIterations) {
00580 if (self->verbosity) {
00581 VL_PRINTF("kmeans: Lloyd terminating because maximum number of iterations reached\n") ;
00582 }
00583 break ;
00584 }
00585 if (energy == previousEnergy) {
00586 if (self->verbosity) {
00587 VL_PRINTF("kmeans: Lloyd terminating because the algorithm fully converged\n") ;
00588 }
00589 break ;
00590 }
00591
00592
00593 previousEnergy = energy ;
00594
00595
00596 memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
00597 for (x = 0 ; x < numData ; ++x) {
00598 clusterMasses[assignments[x]] ++ ;
00599 }
00600
00601 switch (self->distance) {
00602 case VlDistanceL2:
00603 memset(self->centers, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
00604 for (x = 0 ; x < numData ; ++x) {
00605 TYPE * cpt = (TYPE*)self->centers + assignments[x] * self->dimension ;
00606 TYPE const * xpt = data + x * self->dimension ;
00607 for (d = 0 ; d < self->dimension ; ++d) { cpt[d] += xpt[d] ; }
00608 }
00609 for (c = 0 ; c < self->numCenters ; ++c) {
00610 TYPE mass = clusterMasses[c] ;
00611 TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
00612 for (d = 0 ; d < self->dimension ; ++d) { cpt[d] /= mass ; }
00613 }
00614 break ;
00615 case VlDistanceL1:
00616 for (d = 0 ; d < self->dimension ; ++d) {
00617 vl_uint32 * perm = permutations + d * numData ;
00618 memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
00619 for (x = 0; x < numData ; ++x) {
00620 c = assignments[perm[x]] ;
00621 if (2 * numSeenSoFar[c] < clusterMasses[c]) {
00622 ((TYPE*)self->centers) [d + c * self->dimension] =
00623 data [d + perm[x] * self->dimension] ;
00624 }
00625 numSeenSoFar[c] ++ ;
00626 }
00627 }
00628 break ;
00629 default:
00630 abort();
00631 }
00632 }
00633
00634 if (permutations) { vl_free(permutations) ; }
00635 if (numSeenSoFar) { vl_free(numSeenSoFar) ; }
00636 vl_free(distances) ;
00637 vl_free(assignments) ;
00638 vl_free(clusterMasses) ;
00639 return energy ;
00640 }
00641
00642
00643
00644
00645
00646 static double
00647 VL_XCAT(_vl_kmeans_update_center_distances_, SFX)
00648 (VlKMeans * self)
00649 {
00650 #if (FLT == VL_TYPE_FLOAT)
00651 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00652 #else
00653 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00654 #endif
00655
00656 if (! self->centerDistances) {
00657 self->centerDistances = vl_malloc (sizeof(TYPE) *
00658 self->numCenters *
00659 self->numCenters) ;
00660 }
00661 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(self->centerDistances,
00662 self->dimension,
00663 self->centers, self->numCenters,
00664 NULL, 0,
00665 distFn) ;
00666 return self->numCenters * (self->numCenters - 1) / 2 ;
00667 }
00668
00669
00670
00671 static double
00672 VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)
00673 (VlKMeans * self,
00674 TYPE const * data,
00675 vl_size numData)
00676 {
00677 vl_size d, iteration, x ;
00678 vl_uint32 c, j ;
00679 vl_bool allDone ;
00680 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00681 vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
00682 vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
00683
00684 #if (FLT == VL_TYPE_FLOAT)
00685 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00686 #else
00687 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00688 #endif
00689
00690 TYPE * nextCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
00691 TYPE * pointToClosestCenterUB = vl_malloc (sizeof(TYPE) * numData) ;
00692 vl_bool * pointToClosestCenterUBIsStrict = vl_malloc (sizeof(vl_bool) * numData) ;
00693 TYPE * pointToCenterLB = vl_malloc (sizeof(TYPE) * numData * self->numCenters) ;
00694 TYPE * newCenters = vl_malloc(sizeof(TYPE) * self->dimension * self->numCenters) ;
00695 TYPE * centerToNewCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
00696
00697 vl_uint32 * permutations = NULL ;
00698 vl_size * numSeenSoFar = NULL ;
00699
00700 double energy ;
00701
00702 vl_size totDistanceComputationsToInit = 0 ;
00703 vl_size totDistanceComputationsToRefreshUB = 0 ;
00704 vl_size totDistanceComputationsToRefreshLB = 0 ;
00705 vl_size totDistanceComputationsToRefreshCenterDistances = 0 ;
00706 vl_size totDistanceComputationsToNewCenters = 0 ;
00707 vl_size totDistanceComputationsToFinalize = 0 ;
00708
00709 if (self->distance == VlDistanceL1) {
00710 permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
00711 numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
00712 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
00713 }
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725 totDistanceComputationsToInit +=
00726 VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
00727
00728
00729 memset(pointToCenterLB, 0, sizeof(TYPE) * self->numCenters * numData) ;
00730 for (x = 0 ; x < numData ; ++x) {
00731 TYPE distance ;
00732
00733
00734 assignments[x] = 0 ;
00735 distance = distFn(self->dimension,
00736 data + x * self->dimension,
00737 (TYPE*)self->centers + 0) ;
00738 pointToClosestCenterUB[x] = distance ;
00739 pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
00740 pointToCenterLB[0 + x * self->numCenters] = distance ;
00741 totDistanceComputationsToInit += 1 ;
00742
00743
00744 for (c = 1 ; c < self->numCenters ; ++c) {
00745
00746
00747
00748
00749 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
00750 pointToClosestCenterUB[x] <=
00751 ((TYPE*)self->centerDistances)
00752 [c + assignments[x] * self->numCenters]) {
00753 continue ;
00754 }
00755
00756 distance = distFn(self->dimension,
00757 data + x * self->dimension,
00758 (TYPE*)self->centers + c * self->dimension) ;
00759 pointToCenterLB[c + x * self->numCenters] = distance ;
00760 totDistanceComputationsToInit += 1 ;
00761 if (distance < pointToClosestCenterUB[x]) {
00762 pointToClosestCenterUB[x] = distance ;
00763 assignments[x] = c ;
00764 }
00765 }
00766 }
00767
00768
00769 energy = 0 ;
00770 for (x = 0 ; x < numData ; ++x) {
00771 energy += pointToClosestCenterUB[x] ;
00772 }
00773
00774 if (self->verbosity) {
00775 VL_PRINTF("kmeans: Elkan iter 0: energy = %g, dist. calc. = %d\n",
00776 energy, totDistanceComputationsToInit) ;
00777 }
00778
00779
00780 #ifdef SANITY
00781 {
00782 int xx ; int cc ;
00783 TYPE tol = 1e-5 ;
00784 VL_PRINTF("inconsistencies after initial assignments:\n");
00785 for (xx = 0 ; xx < numData ; ++xx) {
00786 for (cc = 0 ; cc < self->numCenters ; ++cc) {
00787 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
00788 TYPE b = distFn(self->dimension,
00789 data + self->dimension * xx,
00790 (TYPE*)self->centers + self->dimension * cc) ;
00791 if (cc == assignments[xx]) {
00792 TYPE z = pointToClosestCenterUB[xx] ;
00793 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
00794 cc, xx, z, b) ;
00795 }
00796 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f\n",
00797 cc, xx, a, b) ;
00798 }
00799 }
00800 }
00801 #endif
00802
00803
00804
00805
00806
00807 for (iteration = 1 ; 1; ++iteration) {
00808
00809 vl_size numDistanceComputationsToRefreshUB = 0 ;
00810 vl_size numDistanceComputationsToRefreshLB = 0 ;
00811 vl_size numDistanceComputationsToRefreshCenterDistances = 0 ;
00812 vl_size numDistanceComputationsToNewCenters = 0 ;
00813
00814
00815
00816
00817
00818 memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
00819 for (x = 0 ; x < numData ; ++x) {
00820 clusterMasses[assignments[x]] ++ ;
00821 }
00822
00823 switch (self->distance) {
00824 case VlDistanceL2:
00825 memset(newCenters, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
00826 for (x = 0 ; x < numData ; ++x) {
00827 TYPE * cpt = newCenters + assignments[x] * self->dimension ;
00828 TYPE const * xpt = data + x * self->dimension ;
00829 for (d = 0 ; d < self->dimension ; ++d) { cpt[d] += xpt[d] ; }
00830 }
00831 for (c = 0 ; c < self->numCenters ; ++c) {
00832 TYPE mass = clusterMasses[c] ;
00833 TYPE * cpt = newCenters + c * self->dimension ;
00834 for (d = 0 ; d < self->dimension ; ++d) { cpt[d] /= mass ; }
00835 }
00836 break ;
00837 case VlDistanceL1:
00838 for (d = 0 ; d < self->dimension ; ++d) {
00839 vl_uint32 * perm = permutations + d * numData ;
00840 memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
00841 for (x = 0; x < numData ; ++x) {
00842 c = assignments[perm[x]] ;
00843 if (2 * numSeenSoFar[c] < clusterMasses[c]) {
00844 newCenters [d + c * self->dimension] =
00845 data [d + perm[x] * self->dimension] ;
00846 }
00847 numSeenSoFar[c] ++ ;
00848 }
00849 }
00850 break ;
00851 default:
00852 abort();
00853 }
00854
00855
00856 for (c = 0 ; c < self->numCenters ; ++c) {
00857 TYPE distance = distFn(self->dimension,
00858 newCenters + c * self->dimension,
00859 (TYPE*)self->centers + c * self->dimension) ;
00860 centerToNewCenterDistances[c] = distance ;
00861 numDistanceComputationsToNewCenters += 1 ;
00862 }
00863
00864
00865 {
00866 TYPE * tmp = self->centers ;
00867 self->centers = newCenters ;
00868 newCenters = tmp ;
00869 }
00870
00871
00872
00873
00874
00875
00876
00877
00878 numDistanceComputationsToRefreshCenterDistances
00879 += VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
00880
00881 for (c = 0 ; c < self->numCenters ; ++c) {
00882 nextCenterDistances[c] = (TYPE) VL_INFINITY_D ;
00883 for (j = 0 ; j < self->numCenters ; ++j) {
00884 if (j == c) continue ;
00885 nextCenterDistances[c] = VL_MIN(nextCenterDistances[c],
00886 ((TYPE*)self->centerDistances)
00887 [j + c * self->numCenters]) ;
00888 }
00889 }
00890
00891
00892
00893
00894
00895 for (x = 0 ; x < numData ; ++x) {
00896 TYPE a = pointToClosestCenterUB[x] ;
00897 TYPE b = centerToNewCenterDistances[assignments[x]] ;
00898 if (self->distance == VlDistanceL1) {
00899 pointToClosestCenterUB[x] = a + b ;
00900 } else {
00901 #if (FLT == VL_TYPE_FLOAT)
00902 TYPE sqrtab = sqrtf (a * b) ;
00903 #else
00904 TYPE sqrtab = sqrt (a * b) ;
00905 #endif
00906 pointToClosestCenterUB[x] = a + b + 2.0 * sqrtab ;
00907 }
00908 pointToClosestCenterUBIsStrict[x] = VL_FALSE ;
00909 }
00910
00911
00912
00913
00914
00915 for (x = 0 ; x < numData ; ++x) {
00916 for (c = 0 ; c < self->numCenters ; ++c) {
00917 TYPE a = pointToCenterLB[c + x * self->numCenters] ;
00918 TYPE b = centerToNewCenterDistances[c] ;
00919 if (a < b) {
00920 pointToCenterLB[c + x * self->numCenters] = 0 ;
00921 } else {
00922 if (self->distance == VlDistanceL1) {
00923 pointToCenterLB[c + x * self->numCenters] = a - b ;
00924 } else {
00925 #if (FLT == VL_TYPE_FLOAT)
00926 TYPE sqrtab = sqrtf (a * b) ;
00927 #else
00928 TYPE sqrtab = sqrt (a * b) ;
00929 #endif
00930 pointToCenterLB[c + x * self->numCenters] = a + b - 2.0 * sqrtab ;
00931 }
00932 }
00933 }
00934 }
00935
00936 #ifdef SANITY
00937 {
00938 int xx ; int cc ;
00939 TYPE tol = 1e-5 ;
00940 VL_PRINTF("inconsistencies before assignments:\n");
00941 for (xx = 0 ; xx < numData ; ++xx) {
00942 for (cc = 0 ; cc < self->numCenters ; ++cc) {
00943 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
00944 TYPE b = distFn(self->dimension,
00945 data + self->dimension * xx,
00946 (TYPE*)self->centers + self->dimension * cc) ;
00947 if (cc == assignments[xx]) {
00948 TYPE z = pointToClosestCenterUB[xx] ;
00949 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
00950 cc, xx, z, b) ;
00951 }
00952 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f (assign = %d)\n",
00953 cc, xx, a, b, assignments[xx]) ;
00954 }
00955 }
00956 }
00957 #endif
00958
00959
00960
00961
00962
00963 for (allDone = VL_TRUE, x = 0 ; x < numData ; ++x) {
00964
00965
00966
00967
00968
00969 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
00970 pointToClosestCenterUB[x] <= nextCenterDistances[assignments[x]]) {
00971 continue ;
00972 }
00973
00974 for (c = 0 ; c < self->numCenters ; ++c) {
00975 vl_uint32 cx = assignments[x] ;
00976 TYPE distance ;
00977
00978
00979
00980
00981
00982
00983
00984
00985
00986
00987 if (cx == c) {
00988 continue ;
00989 }
00990 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
00991 pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
00992 [c + cx * self->numCenters]) {
00993 continue ;
00994 }
00995 if (pointToClosestCenterUB[x] <= pointToCenterLB
00996 [c + x * self->numCenters]) {
00997 continue ;
00998 }
00999
01000
01001 if (! pointToClosestCenterUBIsStrict[x]) {
01002 distance = distFn(self->dimension,
01003 data + self->dimension * x,
01004 (TYPE*)self->centers + self->dimension * cx) ;
01005 pointToClosestCenterUB[x] = distance ;
01006 pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
01007 pointToCenterLB[cx + x * self->numCenters] = distance ;
01008 numDistanceComputationsToRefreshUB += 1 ;
01009
01010 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01011 pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
01012 [c + cx * self->numCenters]) {
01013 continue ;
01014 }
01015 if (pointToClosestCenterUB[x] <= pointToCenterLB
01016 [c + x * self->numCenters]) {
01017 continue ;
01018 }
01019 }
01020
01021
01022
01023
01024
01025
01026
01027 distance = distFn(self->dimension,
01028 data + x * self->dimension,
01029 (TYPE*)self->centers + c * self->dimension) ;
01030 numDistanceComputationsToRefreshLB += 1 ;
01031 pointToCenterLB[c + x * self->numCenters] = distance ;
01032
01033 if (distance < pointToClosestCenterUB[x]) {
01034 assignments[x] = c ;
01035 pointToClosestCenterUB[x] = distance ;
01036 allDone = VL_FALSE ;
01037
01038 }
01039
01040 }
01041 }
01042
01043 totDistanceComputationsToRefreshUB
01044 += numDistanceComputationsToRefreshUB ;
01045
01046 totDistanceComputationsToRefreshLB
01047 += numDistanceComputationsToRefreshLB ;
01048
01049 totDistanceComputationsToRefreshCenterDistances
01050 += numDistanceComputationsToRefreshCenterDistances ;
01051
01052 totDistanceComputationsToNewCenters
01053 += numDistanceComputationsToNewCenters ;
01054
01055 #ifdef SANITY
01056 {
01057 int xx ; int cc ;
01058 TYPE tol = 1e-5 ;
01059 VL_PRINTF("inconsistencies after assignments:\n");
01060 for (xx = 0 ; xx < numData ; ++xx) {
01061 for (cc = 0 ; cc < self->numCenters ; ++cc) {
01062 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01063 TYPE b = distFn(self->dimension,
01064 data + self->dimension * xx,
01065 (TYPE*)self->centers + self->dimension * cc) ;
01066 if (cc == assignments[xx]) {
01067 TYPE z = pointToClosestCenterUB[xx] ;
01068 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01069 cc, xx, z, b) ;
01070 }
01071 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f (assign = %d)\n",
01072 cc, xx, a, b, assignments[xx]) ;
01073 }
01074 }
01075 }
01076 #endif
01077
01078
01079 energy = 0 ;
01080 for (x = 0 ; x < numData ; ++x) {
01081 energy += pointToClosestCenterUB[x] ;
01082 }
01083
01084 if (self->verbosity) {
01085 vl_size numDistanceComputations =
01086 numDistanceComputationsToRefreshUB +
01087 numDistanceComputationsToRefreshLB +
01088 numDistanceComputationsToRefreshCenterDistances +
01089 numDistanceComputationsToNewCenters ;
01090 VL_PRINTF("kmeans: Elkan iter %d: energy <= %g, dist. calc. = %d\n",
01091 iteration,
01092 energy,
01093 numDistanceComputations) ;
01094 if (self->verbosity > 1) {
01095 VL_PRINTF("kmeans: Elkan iter %d: total dist. calc. per type: "
01096 "UB: %.1f%% (%d), LB: %.1f%% (%d), "
01097 "intra_center: %.1f%% (%d), "
01098 "new_center: %.1f%% (%d)\n",
01099 iteration,
01100 100.0 * numDistanceComputationsToRefreshUB / numDistanceComputations,
01101 numDistanceComputationsToRefreshUB,
01102 100.0 *numDistanceComputationsToRefreshLB / numDistanceComputations,
01103 numDistanceComputationsToRefreshLB,
01104 100.0 * numDistanceComputationsToRefreshCenterDistances / numDistanceComputations,
01105 numDistanceComputationsToRefreshCenterDistances,
01106 100.0 * numDistanceComputationsToNewCenters / numDistanceComputations,
01107 numDistanceComputationsToNewCenters) ;
01108 }
01109 }
01110
01111
01112 if (iteration >= self->maxNumIterations) {
01113 if (self->verbosity) {
01114 VL_PRINTF("kmeans: Elkan terminating because maximum number of iterations reached\n") ;
01115 }
01116 break ;
01117 }
01118 if (allDone) {
01119 if (self->verbosity) {
01120 VL_PRINTF("kmeans: Elkan terminating because the algorithm fully converged\n") ;
01121 }
01122 break ;
01123 }
01124
01125 }
01126
01127
01128
01129 energy = 0 ;
01130 for (x = 0 ; x < numData ; ++ x) {
01131 vl_uindex cx = assignments [x] ;
01132 energy += distFn(self->dimension,
01133 data + self->dimension * x,
01134 (TYPE*)self->centers + self->dimension * cx) ;
01135 totDistanceComputationsToFinalize += 1 ;
01136 }
01137
01138 {
01139 vl_size totDistanceComputations =
01140 totDistanceComputationsToInit +
01141 totDistanceComputationsToRefreshUB +
01142 totDistanceComputationsToRefreshLB +
01143 totDistanceComputationsToRefreshCenterDistances +
01144 totDistanceComputationsToNewCenters +
01145 totDistanceComputationsToFinalize ;
01146
01147 double saving = (double)totDistanceComputations
01148 / (iteration * self->numCenters * numData) ;
01149
01150 if (self->verbosity) {
01151 VL_PRINTF("kmeans: Elkan: total dist. calc.: %d (%.2f %% of Lloyd)\n",
01152 totDistanceComputations, saving * 100.0) ;
01153 }
01154
01155 if (self->verbosity > 1) {
01156 VL_PRINTF("kmeans: Elkan: total dist. calc. per type: "
01157 "init: %.1f%% (%d), UB: %.1f%% (%d), LB: %.1f%% (%d), "
01158 "intra_center: %.1f%% (%d), "
01159 "new_center: %.1f%% (%d), "
01160 "finalize: %.1f%% (%d)\n",
01161 100.0 * totDistanceComputationsToInit / totDistanceComputations,
01162 totDistanceComputationsToInit,
01163 100.0 * totDistanceComputationsToRefreshUB / totDistanceComputations,
01164 totDistanceComputationsToRefreshUB,
01165 100.0 *totDistanceComputationsToRefreshLB / totDistanceComputations,
01166 totDistanceComputationsToRefreshLB,
01167 100.0 * totDistanceComputationsToRefreshCenterDistances / totDistanceComputations,
01168 totDistanceComputationsToRefreshCenterDistances,
01169 100.0 * totDistanceComputationsToNewCenters / totDistanceComputations,
01170 totDistanceComputationsToNewCenters,
01171 100.0 * totDistanceComputationsToFinalize / totDistanceComputations,
01172 totDistanceComputationsToFinalize) ;
01173 }
01174 }
01175
01176 if (permutations) { vl_free(permutations) ; }
01177 if (numSeenSoFar) { vl_free(numSeenSoFar) ; }
01178
01179 vl_free(distances) ;
01180 vl_free(assignments) ;
01181 vl_free(clusterMasses) ;
01182
01183 vl_free(nextCenterDistances) ;
01184 vl_free(pointToClosestCenterUB) ;
01185 vl_free(pointToClosestCenterUBIsStrict) ;
01186 vl_free(pointToCenterLB) ;
01187 vl_free(newCenters) ;
01188 vl_free(centerToNewCenterDistances) ;
01189
01190 return energy ;
01191 }
01192
01193
01194 static double
01195 VL_XCAT(_vl_kmeans_refine_centers_, SFX)
01196 (VlKMeans * self,
01197 TYPE const * data,
01198 vl_size numData)
01199 {
01200 switch (self->algorithm) {
01201 case VlKMeansLLoyd:
01202 return
01203 VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)(self, data, numData) ;
01204 break ;
01205 case VlKMeansElkan:
01206 return
01207 VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)(self, data, numData) ;
01208 break ;
01209 default:
01210 abort() ;
01211 }
01212 }
01213
01214
01215 #else
01216
01217 #define FLT VL_TYPE_FLOAT
01218 #define TYPE float
01219 #define SFX f
01220 #define VL_KMEANS_INSTANTIATING
01221 #include "kmeans.c"
01222
01223 #define FLT VL_TYPE_DOUBLE
01224 #define TYPE double
01225 #define SFX d
01226 #define VL_KMEANS_INSTANTIATING
01227 #include "kmeans.c"
01228
01229
01230 #endif
01231
01232
01233 #ifndef VL_KMEANS_INSTANTIATING
01234
01243 VL_EXPORT void
01244 vl_kmeans_set_centers
01245 (VlKMeans * self,
01246 void const * centers,
01247 vl_size dimension,
01248 vl_size numCenters)
01249 {
01250 vl_kmeans_reset (self) ;
01251
01252 switch (self->dataType) {
01253 case VL_TYPE_FLOAT :
01254 _vl_kmeans_set_centers_f
01255 (self, (float const *)centers, dimension, numCenters) ;
01256 break ;
01257 case VL_TYPE_DOUBLE :
01258 _vl_kmeans_set_centers_d
01259 (self, (double const *)centers, dimension, numCenters) ;
01260 break ;
01261 default:
01262 abort() ;
01263 }
01264 }
01265
01279 VL_EXPORT void
01280 vl_kmeans_seed_centers_with_rand_data
01281 (VlKMeans * self,
01282 void const * data,
01283 vl_size dimension,
01284 vl_size numData,
01285 vl_size numCenters)
01286 {
01287 vl_kmeans_reset (self) ;
01288
01289 switch (self->dataType) {
01290 case VL_TYPE_FLOAT :
01291 _vl_kmeans_seed_centers_with_rand_data_f
01292 (self, (float const *)data, dimension, numData, numCenters) ;
01293 break ;
01294 case VL_TYPE_DOUBLE :
01295 _vl_kmeans_seed_centers_with_rand_data_d
01296 (self, (double const *)data, dimension, numData, numCenters) ;
01297 break ;
01298 default:
01299 abort() ;
01300 }
01301 }
01302
01312 VL_EXPORT void
01313 vl_kmeans_seed_centers_plus_plus
01314 (VlKMeans * self,
01315 void const * data,
01316 vl_size dimension,
01317 vl_size numData,
01318 vl_size numCenters)
01319 {
01320 vl_kmeans_reset (self) ;
01321
01322 switch (self->dataType) {
01323 case VL_TYPE_FLOAT :
01324 _vl_kmeans_seed_centers_plus_plus_f
01325 (self, (float const *)data, dimension, numData, numCenters) ;
01326 break ;
01327 case VL_TYPE_DOUBLE :
01328 _vl_kmeans_seed_centers_plus_plus_d
01329 (self, (double const *)data, dimension, numData, numCenters) ;
01330 break ;
01331 default:
01332 abort() ;
01333 }
01334 }
01335
01345 VL_EXPORT void
01346 vl_kmeans_quantize
01347 (VlKMeans * self,
01348 vl_uint32 * assignments,
01349 void * distances,
01350 void const * data,
01351 vl_size numData)
01352 {
01353 switch (self->dataType) {
01354 case VL_TYPE_FLOAT :
01355 _vl_kmeans_quantize_f
01356 (self, assignments, distances, (float const *)data, numData) ;
01357 break ;
01358 case VL_TYPE_DOUBLE :
01359 _vl_kmeans_quantize_d
01360 (self, assignments, distances, (double const *)data, numData) ;
01361 break ;
01362 default:
01363 abort() ;
01364 }
01365 }
01366
01381 VL_EXPORT double
01382 vl_kmeans_refine_centers
01383 (VlKMeans * self,
01384 void const * data,
01385 vl_size numData)
01386 {
01387 assert (self->centers) ;
01388
01389 switch (self->dataType) {
01390 case VL_TYPE_FLOAT :
01391 return
01392 _vl_kmeans_refine_centers_f
01393 (self, (float const *)data, numData) ;
01394 case VL_TYPE_DOUBLE :
01395 return
01396 _vl_kmeans_refine_centers_d
01397 (self, (double const *)data, numData) ;
01398 default:
01399 abort() ;
01400 }
01401 }
01402
01403
01418 VL_EXPORT double
01419 vl_kmeans_cluster (VlKMeans * self,
01420 void const * data,
01421 vl_size dimension,
01422 vl_size numData,
01423 vl_size numCenters)
01424 {
01425 vl_uindex repetition ;
01426 double bestEnergy = VL_INFINITY_D ;
01427 void * bestCenters = NULL ;
01428
01429 for (repetition = 0 ; repetition < self->numRepetitions ; ++ repetition) {
01430 double energy ;
01431 double timeRef ;
01432
01433 if (self->verbosity) {
01434 VL_PRINTF("kmeans: repetition %d of %d\n", repetition + 1, self->numRepetitions) ;
01435 }
01436
01437 timeRef = vl_get_cpu_time() ;
01438 switch (self->initialization) {
01439 case VlKMeansRandomSelection :
01440 vl_kmeans_seed_centers_with_rand_data (self,
01441 data, dimension, numData,
01442 numCenters) ;
01443 break ;
01444 case VlKMeansPlusPlus :
01445 vl_kmeans_seed_centers_plus_plus (self,
01446 data, dimension, numData,
01447 numCenters) ;
01448 break ;
01449 default:
01450 abort() ;
01451 }
01452
01453 if (self->verbosity) {
01454 VL_PRINTF("kmeans: K-means initialized in %.2f s\n",
01455 vl_get_cpu_time() - timeRef) ;
01456 }
01457
01458 timeRef = vl_get_cpu_time () ;
01459 energy = vl_kmeans_refine_centers (self, data, numData) ;
01460 if (self->verbosity) {
01461 VL_PRINTF("kmeans: K-means termineted in %.2f s with energy %g\n",
01462 vl_get_cpu_time() - timeRef, energy) ;
01463 }
01464
01465
01466 if (energy < bestEnergy) {
01467 void * temp ;
01468 bestEnergy = energy ;
01469
01470 if (bestCenters == NULL) {
01471 bestCenters = vl_malloc(vl_get_type_size(self->dataType) *
01472 self->dimension *
01473 self->numCenters) ;
01474 }
01475
01476
01477 temp = bestCenters ;
01478 bestCenters = self->centers ;
01479 self->centers = temp ;
01480 }
01481 }
01482
01483 vl_free (self->centers) ;
01484 self->centers = bestCenters ;
01485 return bestEnergy ;
01486 }
01487
01488
01489 #endif
01490
01491 #undef SFX
01492 #undef TYPE
01493 #undef FLT
01494 #undef VL_KMEANS_INSTANTIATING