Vous êtes sur la page 1sur 8

/*

Author: Blake Richardson


Date: 11/20/2016
Email: blake.a.l.richardson@gmail.com
Copyright: (C) 2016 DigiPen Institute of Technology. Reproduction or disclosure
of this file or its contents without the prior written consent of DigiPen
Institute of Technology is prohibited.
*/
#include "Kmeans.h"
#include "Random.h"
#include <algorithm>
#include <cfloat>
#include <climits>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <string>
#include <vector>
// Returns the sum of the absolute per-column differences between two points,
// with every column parsed as an integer through std::atoi.
//
// Only the first point_1.GetSize() columns are visited, so point_2 is expected
// to hold at least that many columns. Text that std::atoi cannot parse (for
// example "yes" or "no") contributes as 0.
int Distance(DataBlock point_1, DataBlock point_2)
{
  int return_value = 0;
  for(unsigned i = 0; i < point_1.GetSize(); i++)
  {
    //parse each cell once instead of once per use
    const int delta = std::atoi(point_2[i].c_str()) - std::atoi(point_1[i].c_str());
    //taking |delta| directly instead of sqrt(delta * delta) avoids the int
    //overflow the square hits for differences above roughly 46340
    return_value += std::abs(delta);
  }
  return return_value;
}
double Distance(DataBlock point_1, DataBlock point_2, std::vector<std::string> v
ariables)
{
double return_value = 0;
for(unsigned i = 0; i < variables.size(); i++)
{
if(point_2[variables[i]] == "yes" || point_1[variables[i]] == "yes" || point
_2[variables[i]] == "no" || point_1[variables[i]] == "no")
{
int p1 = 0;
int p2 = 0;
if(point_2[variables[i]] == "yes")
p2 = 10000;
else if(point_2[variables[i]] == "no")
p2 = 1;
else
p2 = std::atoi(point_2[variables[i]].c_str());
if(point_1[variables[i]] == "yes")
p1 = 10000;
else if(point_1[variables[i]] == "no")
p1 = 1;
else
p1 = std::atoi(point_1[variables[i]].c_str());
if(p2 >= p1)
return_value += std::sqrt((p2 - p1) * (p2 - p1));
else if(p1 > p2)
return_value += std::sqrt((p1 - p2) * (p1 - p2));
}
else
return_value += std::sqrt((std::atof(point_2[variables[i]].c_str()) - std:

:atof(point_1[variables[i]].c_str())) * (std::atof(point_2[variables[i]].c_str()
) - std::atof(point_1[variables[i]].c_str())));
}
return return_value;
}
// Returns the index of the smallest element of `inputs`; when several elements
// tie for smallest, the lowest index wins. Returns 0 for an empty vector.
int Min(std::vector<double> inputs)
{
  int return_value = 0;
  //start from +infinity (not FLT_MAX) so doubles beyond the float range still
  //take part in the comparison
  double current_min = std::numeric_limits<double>::infinity();
  for(std::size_t i = 0; i < inputs.size(); ++i)
  {
    if(inputs[i] < current_min)
    {
      return_value = static_cast<int>(i);
      current_min = inputs[i];
    }
  }
  return return_value;
}
// Reports whether `value` occurs anywhere in `check`.
bool IsIn(int value, std::vector<int>& check)
{
  for(int candidate : check)
    if(candidate == value)
      return true;
  return false;
}
// Returns true only when no element of `values` is found in `check` by IsIn;
// an empty `values` therefore yields true.
bool NoneIn(std::vector<int>& values, std::vector<int>& check)
{
  for(int candidate : values)
  {
    const bool shared = IsIn(candidate, check);
    if(shared)
      return false;
  }
  return true;
}
// Returns the index of the smallest element of `inputs` whose index is not
// listed in `unconsidered`; ties go to the lowest index. Returns 0 when every
// index is excluded or `inputs` is empty.
int Min(std::vector<int>& inputs, std::vector<int>& unconsidered)
{
  int return_value = 0;
  int current_min = 0;
  bool found = false; //whether any eligible element has been seen yet
  for(std::size_t i = 0; i < inputs.size(); ++i)
  {
    const int index = static_cast<int>(i);
    if(std::find(unconsidered.begin(), unconsidered.end(), index) != unconsidered.end())
      continue;
    //seeding from the first eligible element, rather than an INT_MAX sentinel,
    //lets elements equal to INT_MAX be selected as well
    if(!found || inputs[i] < current_min)
    {
      found = true;
      return_value = index;
      current_min = inputs[i];
    }
  }
  return return_value;
}
// Despite the name, computes the column-wise arithmetic mean of `input`.
//
// "yes" counts as 10000 and "no" as 1 (the same weights the variable-based
// Distance overload applies, so a mean and a raw point stay comparable); any
// other value is parsed with std::atof. The result reuses the column names of
// input[0] and stores each mean through std::to_string. An empty `input`
// yields a default-constructed DataBlock.
DataBlock GeometricMean(std::vector<DataBlock>& input)
{
  if(input.empty())
    return DataBlock();

  std::vector<std::string> column_names = input[0].GetColumnNames();
  std::vector<double> sums(input[0].GetSize(), 0.0);

  //walk the rows by reference; copying every DataBlock is not needed
  for(auto& row : input)
  {
    //the extra bound keeps a row wider than input[0] from writing past sums
    for(unsigned i = 0; i < row.GetSize() && i < sums.size(); i++)
    {
      if(row[i] == "yes")
        sums[i] += 10000;
      else if(row[i] == "no")
        sums[i] += 1;
      else
        sums[i] += std::atof(row[i].c_str());
    }
  }

  DataBlock block;
  for(unsigned i = 0; i < sums.size(); i++)
    block.AddColumn(column_names[i], std::to_string(sums[i] / input.size()));
  return block;
}
// Builds a centroid whose member points are all the values held by `input`
// and whose center is one of those members, picked with
// RandomRange(0, member count - 1). When `input` holds no values the center is
// left default-constructed.
Centroid::Centroid(Dictionary input)
{
  const int size = input.GetSize();
  for(int i = 0; i < size; i++)
    m_member_points.push_back(input.GetValue(i));
  //guard the empty case: size() - 1 would wrap around and the lookup would
  //index into an empty vector
  if(!m_member_points.empty())
    m_center = m_member_points[RandomRange(0, m_member_points.size() - 1)];
}
// Builds a centroid from a single point: `input` becomes the center and is
// also appended to the member points.
Centroid::Centroid(DataBlock input)
  : m_center(input)
{
  m_member_points.emplace_back(input);
}
// Returns a mutable reference to the member point stored at `index`.
// The index is not range-checked.
DataBlock& Centroid::GetValue(int index)
{
  DataBlock& member = m_member_points[index];
  return member;
}

// Appends a copy of `input` to this centroid's member points. The center is
// not recomputed here.
void Centroid::AddPoint(DataBlock input)
{
  m_member_points.emplace_back(input);
}
// Erases the first member point that compares equal to `value_to_remove`.
// Does nothing when no member matches.
void Centroid::RemovePoint(DataBlock value_to_remove)
{
  for(auto it = m_member_points.begin(); it != m_member_points.end(); ++it)
  {
    if(*it == value_to_remove)
    {
      //only the first match goes; later duplicates stay in place
      m_member_points.erase(it);
      return;
    }
  }
}
// Returns a mutable reference to this centroid's current center.
DataBlock& Centroid::GetCenter()
{
  return this->m_center;
}
void Centroid::UpdateCenter()
{
if(m_member_points.size() > 0)
{
DataBlock new_center = GeometricMean(m_member_points);
m_center = new_center;
}
}
// Returns a copy of every member point currently held by this centroid.
std::vector<DataBlock> Centroid::GetAllPoints()
{
  std::vector<DataBlock> snapshot(m_member_points);
  return snapshot;
}
unsigned Centroid::GetNumberOfMemberPoints()
{
return m_member_points.size();
}
// Returns true when `value_to_check` does not appear in `used_values`.
bool IsNotUsed(std::vector<int>& used_values, int value_to_check)
{
  for(int taken : used_values)
    if(taken == value_to_check)
      return false;
  return true;
}
// Clusters every value of `dataset` into `number_of_groups` centroids, using
// the integer Distance overload that compares all columns.
//
// Steps:
//   1. Pick `number_of_groups` distinct random dataset indices as seed centers.
//   2. Attach every remaining point to the seed with the smallest distance.
//   3. Repeat `iterations` times: recompute each center, then move every point
//      to the centroid whose center is nearest.
//
// Works for any positive group count. The count is clamped to the dataset size
// because a larger request could never find enough distinct seeds; a
// non-positive count or an empty dataset yields an empty result.
std::vector<Centroid> KmeansClassifier(Dictionary& dataset, int number_of_groups, int iterations)
{
  std::vector<Centroid> return_vec;
  const int dataset_size = dataset.GetSize();
  const int group_count = number_of_groups < dataset_size ? number_of_groups : dataset_size;
  if(group_count <= 0)
    return return_vec;

  //randomly assign the centroids
  std::vector<int> used_values;
  for(int i = 0; i < group_count; i++)
  {
    //keep drawing random indices until one turns up that is not a seed yet
    int random_value = RandomRange(0, dataset_size - 1);
    while(!IsNotUsed(used_values, random_value))
      random_value = RandomRange(0, dataset_size - 1);
    //remember the index and make the chosen point the center of a new centroid
    used_values.push_back(random_value);
    Centroid new_centroid(dataset.GetValue(random_value));
    return_vec.push_back(new_centroid);
  }

  //initial loop to assign each point to its nearest seed
  for(int i = 0; i < dataset_size; i++)
  {
    //seed points already belong to their own centroid
    if(!IsNotUsed(used_values, i))
      continue;
    //put the distance to every center into a vector
    std::vector<double> holder;
    for(unsigned g = 0; g < return_vec.size(); g++)
    {
      const int distance = Distance(dataset.GetValue(i), return_vec[g].GetCenter());
      holder.push_back(distance);
    }
    //find the smallest distance and add the point to that cluster's members
    const int index_of_minimum = Min(holder);
    return_vec[index_of_minimum].AddPoint(dataset.GetValue(i));
  }

  //perform as many iterations as directed
  for(int z = 0; z < iterations; z++)
  {
    for(unsigned g = 0; g < return_vec.size(); g++)
      return_vec[g].UpdateCenter();

    for(unsigned i = 0; i < return_vec.size(); i++)
    {
      unsigned j = 0;
      while(j < return_vec[i].GetNumberOfMemberPoints())
      {
        std::vector<double> holder;
        for(unsigned g = 0; g < return_vec.size(); g++)
        {
          const int distance = Distance(return_vec[i].GetValue(j), return_vec[g].GetCenter());
          holder.push_back(distance);
        }
        const unsigned index_of_minimum = static_cast<unsigned>(Min(holder));
        if(index_of_minimum == i)
        {
          j++;
          continue;
        }
        //move the point over to the nearer centroid
        DataBlock temp = return_vec[i].GetValue(j);
        const unsigned members_before = return_vec[i].GetNumberOfMemberPoints();
        return_vec[i].RemovePoint(temp);
        return_vec[index_of_minimum].AddPoint(temp);
        //a successful removal shifts the next point into slot j, so j only
        //advances when nothing was actually removed
        if(return_vec[i].GetNumberOfMemberPoints() == members_before)
          j++;
      }
    }
  }
  return return_vec;
}
// Clusters every value of `dataset` into `number_of_groups` centroids, using
// the Distance overload restricted to the columns in `variables_to_check`.
//
// Steps:
//   1. Pick `number_of_groups` distinct random dataset indices as seed centers.
//   2. Attach every remaining point to the seed with the smallest distance.
//   3. Repeat `iterations` times: recompute each center, then move every point
//      to the centroid whose center is nearest.
//
// The group count is clamped to the dataset size because a larger request
// could never find enough distinct seeds; a non-positive count or an empty
// dataset yields an empty result.
std::vector<Centroid> KmeansClassifier(Dictionary& dataset, int number_of_groups, int iterations, std::vector<std::string> variables_to_check)
{
  std::vector<Centroid> return_vec;
  const int dataset_size = dataset.GetSize();
  const int group_count = number_of_groups < dataset_size ? number_of_groups : dataset_size;
  if(group_count <= 0)
    return return_vec;

  //randomly assign the centroids
  std::vector<int> used_values;
  for(int i = 0; i < group_count; i++)
  {
    //keep drawing random indices until one turns up that is not a seed yet
    int random_value = RandomRange(0, dataset_size - 1);
    while(!IsNotUsed(used_values, random_value))
      random_value = RandomRange(0, dataset_size - 1);
    //remember the index and make the chosen point the center of a new centroid
    used_values.push_back(random_value);
    Centroid new_centroid(dataset.GetValue(random_value));
    return_vec.push_back(new_centroid);
  }

  //initial loop to assign each point to its nearest seed
  for(int i = 0; i < dataset_size; i++)
  {
    //seed points already belong to their own centroid
    if(!IsNotUsed(used_values, i))
      continue;
    //put the distance to every center into a vector
    std::vector<double> distances;
    for(int j = 0; j < group_count; j++)
      distances.push_back(Distance(dataset.GetValue(i), return_vec[j].GetCenter(), variables_to_check));
    //find the smallest distance and add the point to that cluster's members
    const int index_of_minimum = Min(distances);
    return_vec[index_of_minimum].AddPoint(dataset.GetValue(i));
  }

  //perform as many iterations as directed
  for(int z = 0; z < iterations; z++)
  {
    for(int i = 0; i < group_count; i++)
      return_vec[i].UpdateCenter();

    for(unsigned i = 0; i < return_vec.size(); i++)
    {
      unsigned j = 0;
      while(j < return_vec[i].GetNumberOfMemberPoints())
      {
        std::vector<double> distances;
        for(int k = 0; k < group_count; k++)
          distances.push_back(Distance(return_vec[i].GetValue(j), return_vec[k].GetCenter(), variables_to_check));
        const unsigned index_of_minimum = static_cast<unsigned>(Min(distances));
        if(index_of_minimum == i)
        {
          j++;
          continue;
        }
        //move the point over to the nearer centroid
        DataBlock temp = return_vec[i].GetValue(j);
        const unsigned members_before = return_vec[i].GetNumberOfMemberPoints();
        return_vec[i].RemovePoint(temp);
        return_vec[index_of_minimum].AddPoint(temp);
        //a successful removal shifts the next point into slot j, so j only
        //advances when nothing was actually removed
        if(return_vec[i].GetNumberOfMemberPoints() == members_before)
          j++;
      }
    }
  }
  return return_vec;
}
std::vector<Centroid> Organize(std::vector<Centroid>& groups, std::string attrib

ute_to_org_by)
{
std::vector<Centroid> return_vec;
std::vector<int> values;
std::vector<int> ignore = { -1 };
for(unsigned i = 0; i < groups.size(); i++)
values.push_back(std::atoi(groups[i].GetCenter()[attribute_to_or
g_by].c_str()));
for(unsigned i = 0; i < groups.size(); i++)
{
int index = Min(values, ignore);
return_vec.push_back(groups[index]);
ignore.push_back(index);
}
return return_vec;
}

Vous aimerez peut-être aussi