Fix inconsistency that could cause the optimization algorithm to osci…

…llate. Fixes cjlin1#225. # Background The optimization algorithm has three main calculations: 1. Select the working set `{i, j}` that minimizes the decrease in the objective function. 2. Change `alpha[i]` and `alpha[j]` to minimize the decrease in the objective function while respecting constraints. 3. Update the gradient of the objective function according to the changes to `alpha[i]` and `alpha[j]`. All three calculations make use of the matrix `Q`, which is represented by the `QMatrix` class. The `QMatrix` class has two main methods: - `get_Q`, which returns an array of values for a single column of the matrix; and - `get_QD`, which returns an array of diagonal values. # Problem `Q` values are of type `Qfloat` while `QD` values are of type `double`. `Qfloat` is currently defined as `float`, so there can be inconsistency in the diagonal values returned by `get_Q` and `get_QD`. For example, in cjlin1#225, one of the diagonal values is `181.05748749793070829` as `double` and `180.99411909539512067` as `float`. The first two calculations of the optimization algorithm access the diagonal values via `get_QD`. However, the third calculation accesses the diagonal values via `get_Q`. This inconsistency between the minimization calculations and the gradient update can cause the optimization algorithm to oscillate, as demonstrated by cjlin1#225. # Solution We change `get_Q` to return a new class called `QColumn` instead of a plain array of values. The `QColumn` class overloads the subscript operator, so accessing individual elements is the same as before. Internally though, the `QColumn` class will return the `QD` value when the diagonal element is accessed. This guarantees that all calculations are using the same values for the diagonal elements, eliminating the inconsistency. # Alternatives Considered Alternatively, we could change `Qfloat` to be defined as `double`. This would also eliminate the inconsistency; however, it would reduce the cache capacity by half. # Future Changes The Java code will be updated similarly in a separate commit.
fumoboy007 · Dec 21, 2024 · 87d106b · 87d106b
1 parent 35e5596
commit 87d106b
Showing 1 changed file with 51 additions and 18 deletions.
diff --git a/svm.cpp b/svm.cpp
@@ -188,6 +188,37 @@ void Cache::swap_index(int i, int j)
 	}
 }
 
+// Provides access to the elements in a single column of a QMatrix.
+class QColumn {
+public:
+   QColumn(const Qfloat *values, int column_idx, double diagonal_value)
+   : values(values), column_idx(column_idx), diagonal_value(diagonal_value) {
+   }
+
+   double operator[](int row_idx) const {
+      if (row_idx != column_idx) {
+         return values[row_idx];
+      } else {
+         // Return the value from the QD array so that calculations stay
+         // consistent, regardless of whether QColumn or the QD array
+         // are used.
+         //
+         // QD array is double precision while Qfloat could be single
+         // precision, so the diagonal values could be very different
+         // between the two. If one calculation uses QColumn while
+         // another calculation uses the QD array, the optimization
+         // algorithm may not converge.
+         return diagonal_value;
+      }
+   }
+
+private:
+   const Qfloat *values;
+
+   int column_idx;
+   double diagonal_value;
+};
+
 //
 // Kernel evaluation
 //
@@ -197,7 +228,7 @@ void Cache::swap_index(int i, int j)
 //
 class QMatrix {
 public:
-	virtual Qfloat *get_Q(int column, int len) const = 0;
+	virtual QColumn get_Q(int column, int len) const = 0;
 	virtual double *get_QD() const = 0;
 	virtual void swap_index(int i, int j) const = 0;
 	virtual ~QMatrix() {}
@@ -210,7 +241,7 @@ class Kernel: public QMatrix {
 
 	static double k_function(const svm_node *x, const svm_node *y,
 				 const svm_parameter& param);
-	virtual Qfloat *get_Q(int column, int len) const = 0;
+	virtual QColumn get_Q(int column, int len) const = 0;
 	virtual double *get_QD() const = 0;
 	virtual void swap_index(int i, int j) const	// no so const...
 	{
@@ -486,7 +517,7 @@ void Solver::reconstruct_gradient()
 	{
 		for(i=active_size;i<l;i++)
 		{
-			const Qfloat *Q_i = Q->get_Q(i,active_size);
+			QColumn Q_i = Q->get_Q(i,active_size);
 			for(j=0;j<active_size;j++)
 				if(is_free(j))
 					G[i] += alpha[j] * Q_i[j];
@@ -497,7 +528,7 @@ void Solver::reconstruct_gradient()
 		for(i=0;i<active_size;i++)
 			if(is_free(i))
 			{
-				const Qfloat *Q_i = Q->get_Q(i,l);
+				QColumn Q_i = Q->get_Q(i,l);
 				double alpha_i = alpha[i];
 				for(j=active_size;j<l;j++)
 					G[j] += alpha_i * Q_i[j];
@@ -548,7 +579,7 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_,
 		for(i=0;i<l;i++)
 			if(!is_lower_bound(i))
 			{
-				const Qfloat *Q_i = Q.get_Q(i,l);
+				QColumn Q_i = Q.get_Q(i,l);
 				double alpha_i = alpha[i];
 				int j;
 				for(j=0;j<l;j++)
@@ -594,8 +625,8 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_,
 
 		// update alpha[i] and alpha[j], handle bounds carefully
 
-		const Qfloat *Q_i = Q.get_Q(i,active_size);
-		const Qfloat *Q_j = Q.get_Q(j,active_size);
+		QColumn Q_i = Q.get_Q(i,active_size);
+		QColumn Q_j = Q.get_Q(j,active_size);
 
 		double C_i = get_C(i);
 		double C_j = get_C(j);
@@ -822,9 +853,11 @@ int Solver::select_working_set(int &out_i, int &out_j)
 		}
 
 	int i = Gmax_idx;
-	const Qfloat *Q_i = NULL;
-	if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1
-		Q_i = Q->get_Q(i,active_size);
+	if (i == -1) {
+	   return 1;
+	}
+
+	QColumn Q_i = Q->get_Q(i,active_size);
 
 	for(int j=0;j<active_size;j++)
 	{
@@ -1071,8 +1104,8 @@ int Solver_NU::select_working_set(int &out_i, int &out_j)
 
 	int ip = Gmaxp_idx;
 	int in = Gmaxn_idx;
-	const Qfloat *Q_ip = NULL;
-	const Qfloat *Q_in = NULL;
+	QColumn Q_ip = QColumn(NULL, -1, 0);
+	QColumn Q_in = QColumn(NULL, -1, 0);
 	if(ip != -1) // NULL Q_ip not accessed: Gmaxp=-INF if ip=-1
 		Q_ip = Q->get_Q(ip,active_size);
 	if(in != -1)
@@ -1280,7 +1313,7 @@ class SVC_Q: public Kernel
 			QD[i] = (this->*kernel_function)(i,i);
 	}
 
-	Qfloat *get_Q(int i, int len) const
+	QColumn get_Q(int i, int len) const
 	{
 		Qfloat *data;
 		int start, j;
@@ -1292,7 +1325,7 @@ class SVC_Q: public Kernel
 			for(j=start;j<len;j++)
 				data[j] = (Qfloat)(y[i]*y[j]*(this->*kernel_function)(i,j));
 		}
-		return data;
+		return QColumn(data, i, QD[i]);
 	}
 
 	double *get_QD() const
@@ -1332,7 +1365,7 @@ class ONE_CLASS_Q: public Kernel
 			QD[i] = (this->*kernel_function)(i,i);
 	}
 
-	Qfloat *get_Q(int i, int len) const
+	QColumn get_Q(int i, int len) const
 	{
 		Qfloat *data;
 		int start, j;
@@ -1341,7 +1374,7 @@ class ONE_CLASS_Q: public Kernel
 			for(j=start;j<len;j++)
 				data[j] = (Qfloat)(this->*kernel_function)(i,j);
 		}
-		return data;
+		return QColumn(data, i, QD[i]);
 	}
 
 	double *get_QD() const
@@ -1398,7 +1431,7 @@ class SVR_Q: public Kernel
 		swap(QD[i],QD[j]);
 	}
 
-	Qfloat *get_Q(int i, int len) const
+	QColumn get_Q(int i, int len) const
 	{
 		Qfloat *data;
 		int j, real_i = index[i];
@@ -1417,7 +1450,7 @@ class SVR_Q: public Kernel
 		schar si = sign[i];
 		for(j=0;j<len;j++)
 			buf[j] = (Qfloat) si * (Qfloat) sign[j] * data[index[j]];
-		return buf;
+		return QColumn(buf, i, QD[i]);
 	}
 
 	double *get_QD() const