-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkmeans
97 lines (82 loc) · 3.68 KB
/
kmeans
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
Copyright (C) 2018-2024 Geoffrey Daniels. https://gpdaniels.com/
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License only.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#ifndef GTL_ALGORITHM_KMEANS_HPP
#define GTL_ALGORITHM_KMEANS_HPP
// Summary: Implementation of the KMeans clustering algorithm. [wip]
#if defined(_MSC_VER)
#pragma warning(push, 0)
#endif
#include <vector>
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
namespace gtl {
/// Clusters a set of values with the iterative k-means procedure.
///
/// @tparam data_type     Type of a data point. The code requires it to be default
///                       constructible, copy assignable, and to provide
///                       `data_type + data_type` and `data_type / int`. A value-initialised
///                       `data_type` is used as the starting value of each per-cluster sum,
///                       so it is assumed to behave as an additive zero.
/// @tparam distance_type Type returned by the distance function. It must be constructible
///                       from `0` and support `+=` and `<`.
template <typename data_type, typename distance_type>
class kmeans final {
private:
    /// Signature of the caller-supplied metric used to compare two data points.
    using distance_function_type = distance_type(const data_type&, const data_type&);

public:
    /// Assigns every element of @p data to one of up to @p cluster_count clusters.
    ///
    /// Centroids are seeded from the first points of @p data. Each iteration assigns every
    /// point to its closest centroid (ties keep the lowest cluster index) and then moves
    /// each centroid to the mean of its points.
    ///
    /// @param data              Points to cluster.
    /// @param cluster_count     Requested number of clusters. Values larger than
    ///                          `data.size()` are clamped to `data.size()`, because the
    ///                          centroids are seeded from distinct elements of @p data.
    /// @param max_iterations    Upper bound on the number of assign/update rounds. If it is
    ///                          not positive, no round runs and every label is 0.
    /// @param min_delta         Iteration stops early once the summed centroid movement of
    ///                          a round is strictly below this value.
    /// @param distance_function Metric used for assignment and for measuring centroid movement.
    /// @return A vector of `data.size()` labels; element `i` is the cluster index of `data[i]`.
    ///         If @p data is empty or @p cluster_count is zero, no clustering is performed
    ///         and every label is 0.
    static std::vector<std::size_t> compute(
        const std::vector<data_type>& data,
        std::size_t cluster_count,
        int max_iterations,
        distance_type min_delta,
        distance_function_type distance_function
    ) {
        std::vector<std::size_t> cluster_data(data.size());

        // Nothing to cluster, or nothing to cluster into: avoid indexing empty containers.
        if (data.empty() || (cluster_count == 0)) {
            return cluster_data;
        }

        // Centroids are seeded from the first points, so there cannot be more clusters than points.
        if (cluster_count > data.size()) {
            cluster_count = data.size();
        }

        // Initialise cluster centroids using data (could be done using random points).
        std::vector<data_type> cluster_centroids(cluster_count);
        for (std::size_t i = 0; i < cluster_count; ++i) {
            cluster_centroids[i] = data[i];
        }

        // Iteratively improve the clusters.
        for (int iteration = 0; iteration < max_iterations; ++iteration) {
            // Allocate points to clusters.
            for (std::size_t i = 0; i < data.size(); ++i) {
                // Assume first cluster.
                cluster_data[i] = 0;
                distance_type cluster_distance = distance_function(data[i], cluster_centroids[0]);
                // Search remaining clusters for closer centroids.
                for (std::size_t j = 1; j < cluster_count; ++j) {
                    const distance_type distance = distance_function(data[i], cluster_centroids[j]);
                    if (distance < cluster_distance) {
                        cluster_data[i] = j;
                        cluster_distance = distance;
                    }
                }
            }

            // Accumulate the per-cluster sums and member counts.
            std::vector<data_type> centroid_data_sum(cluster_count);
            std::vector<int> centroid_data_count(cluster_count);
            for (std::size_t i = 0; i < data.size(); ++i) {
                const std::size_t cluster = cluster_data[i];
                centroid_data_sum[cluster] = centroid_data_sum[cluster] + data[i];
                ++centroid_data_count[cluster];
            }

            // Recalculate cluster centroids and measure how far they moved in total.
            distance_type delta = 0;
            for (std::size_t i = 0; i < cluster_count; ++i) {
                // A cluster with no members has no mean; keep its previous centroid rather
                // than dividing by zero. It contributes no movement to delta.
                if (centroid_data_count[i] == 0) {
                    continue;
                }
                data_type centroid = centroid_data_sum[i] / centroid_data_count[i];
                delta += distance_function(cluster_centroids[i], centroid);
                cluster_centroids[i] = centroid;
            }

            // Converged: the centroids barely moved during this round.
            if (delta < min_delta) {
                break;
            }
        }

        return cluster_data;
    }
};
}
#endif // GTL_ALGORITHM_KMEANS_HPP