From 3101ae7973b94f6eea176bb302813210eb3feeb3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Sun, 18 Apr 2021 19:04:14 +0200
Subject: [PATCH] remove learn

---
 src/Makefile | 4 -
 src/extra/stockfish_blas.cpp | 1291 ---------------
 src/extra/stockfish_blas.h | 140 --
 src/learn/autograd.h | 667 --------
 src/learn/gensfen.cpp | 5 +-
 src/learn/gensfen_nonpv.cpp | 1 -
 src/learn/half_float.h | 133 --
 src/learn/learn.cpp | 1474 -----------------
 src/learn/learn.h | 148 --
 src/nnue/evaluate_nnue_learner.cpp | 341 ----
 src/nnue/evaluate_nnue_learner.h | 52 -
 src/nnue/nnue_test_command.cpp | 215 ---
 src/nnue/nnue_test_command.h | 12 -
 src/nnue/trainer/features/all_factorizers.h | 10 -
 src/nnue/trainer/features/factorizer.h | 117 --
 .../trainer/features/factorizer_feature_set.h | 121 --
 .../trainer/features/factorizer_half_ka.h | 93 --
 .../trainer/features/factorizer_half_kp.h | 104 --
 src/nnue/trainer/trainer.h | 122 --
 src/nnue/trainer/trainer_affine_transform.h | 476 ------
 src/nnue/trainer/trainer_clipped_relu.h | 354 ----
 .../trainer/trainer_feature_transformer.h | 783 ---------
 src/nnue/trainer/trainer_input_slice.h | 383 -----
 src/nnue/trainer/trainer_sum.h | 201 ---
 src/uci.cpp | 25 -
 25 files changed, 2 insertions(+), 7270 deletions(-)
 delete mode 100644 src/extra/stockfish_blas.cpp
 delete mode 100644 src/extra/stockfish_blas.h
 delete mode 100644 src/learn/autograd.h
 delete mode 100644 src/learn/half_float.h
 delete mode 100644 src/learn/learn.cpp
 delete mode 100644 src/learn/learn.h
 delete mode 100644 src/nnue/evaluate_nnue_learner.cpp
 delete mode 100644 src/nnue/evaluate_nnue_learner.h
 delete mode 100644 src/nnue/nnue_test_command.cpp
 delete mode 100644 src/nnue/nnue_test_command.h
 delete mode 100644 src/nnue/trainer/features/all_factorizers.h
 delete mode 100644 src/nnue/trainer/features/factorizer.h
 delete mode 100644 src/nnue/trainer/features/factorizer_feature_set.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_ka.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_kp.h
 delete mode 100644 src/nnue/trainer/trainer.h
 delete mode 100644 src/nnue/trainer/trainer_affine_transform.h
 delete mode 100644 src/nnue/trainer/trainer_clipped_relu.h
 delete mode 100644 src/nnue/trainer/trainer_feature_transformer.h
 delete mode 100644 src/nnue/trainer/trainer_input_slice.h
 delete mode 100644 src/nnue/trainer/trainer_sum.h

diff --git a/src/Makefile b/src/Makefile
index a4ced5f0..19927ce5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,9 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	extra/stockfish_blas.cpp \
	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
	nnue/features/half_kp.cpp \
	nnue/features/half_ka.cpp \
	nnue/features/half_relative_kp.cpp \
@@ -59,9 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
	nnue/features/a.cpp \
	nnue/features/castling_right.cpp \
	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
	learn/sfen_packer.cpp \
-	learn/learn.cpp \
	learn/gensfen.cpp \
	learn/gensfen_nonpv.cpp \
	learn/opening_book.cpp \
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
deleted file mode 100644
index 70b258bc..00000000
--- a/src/extra/stockfish_blas.cpp
+++ /dev/null
@@ -1,1291 +0,0 @@
-#include "stockfish_blas.h" - -#include "thread.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(USE_SSE2) -#include -#endif - -#if defined (USE_SSE3) -#include -#endif - -#if defined(USE_BLAS) -#include -#endif - -namespace Blas { - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - std::memcpy(Y, X, sizeof(float) * N); - } - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - if (incX == 1 && incY == 1) - { - scopy(N, X, Y); - } - else - { - for(int i = 0; i < N; ++i) - { - *Y = *X; - X += incX; - Y += incY; - } - } - } - - void scopy( - ThreadPool&, - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - scopy(N, X, Y); - } - - void scopy( - ThreadPool&, - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - scopy(N, X, incX, Y, incY); - } - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ) - { -#if defined (USE_SSE2) - - const __m128 alpha4 = _mm_set1_ps(alpha); - - int i = 0; - for(; i < N - 31; i += 32) - { - __m128 x0 = _mm_loadu_ps(X + i + 0); - __m128 x1 = _mm_loadu_ps(X + i + 4); - __m128 x2 = _mm_loadu_ps(X + i + 8); - __m128 x3 = _mm_loadu_ps(X + i + 12); - __m128 x4 = _mm_loadu_ps(X + i + 16); - __m128 x5 = _mm_loadu_ps(X + i + 20); - __m128 x6 = _mm_loadu_ps(X + i + 24); - __m128 x7 = _mm_loadu_ps(X + i + 28); - - x0 = _mm_mul_ps(x0, alpha4); - x1 = _mm_mul_ps(x1, alpha4); - x2 = _mm_mul_ps(x2, alpha4); - x3 = _mm_mul_ps(x3, alpha4); - x4 = _mm_mul_ps(x4, alpha4); - x5 = _mm_mul_ps(x5, alpha4); - x6 = _mm_mul_ps(x6, alpha4); - x7 = _mm_mul_ps(x7, alpha4); - - _mm_storeu_ps(X + i + 0, x0); - _mm_storeu_ps(X + i + 4, x1); - _mm_storeu_ps(X + i + 8, x2); - _mm_storeu_ps(X + i + 12, x3); - _mm_storeu_ps(X + i + 16, x4); - _mm_storeu_ps(X + i + 20, x5); - _mm_storeu_ps(X + i + 24, x6); - _mm_storeu_ps(X + i + 28, x7); - } - - for(; i < N; ++i) - { - X[i] *= alpha; - } - -#else - - for(int i = 0; i < N; ++i) - { - X[i] *= alpha; - } - -#endif - } - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ) - { - if (incX == 1) - { - sscal(N, alpha, X); - } - else - { - for(int i = 0; i < N; ++i) - { - *X *= alpha; - X += incX; - } - } - } - - void sscal( - ThreadPool&, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ) - { - sscal(N, alpha, X); - } - - void sscal( - ThreadPool&, - const int N, - const float alpha, - float *X, const int incX - ) - { - sscal(N, alpha, X, incX); - } - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - if (alpha == 1.0f) - { - for (int i = 0; i < N; ++i) - { - Y[i] += X[i]; - } - } - else - { - for (int i = 0; i < N; ++i) - { - Y[i] += X[i] * alpha; - } - } - - } - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - if (incX == 1 && incY == 1) - { - saxpy(N, alpha, X, Y); - } - else - { - for(int i = 0; i < N; ++i) - { - *Y += *X * alpha; - Y += incY; - X += incX; - } - } - } - - void saxpy( - ThreadPool&, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ) - { - saxpy(N, alpha, X, Y); - } - - void saxpy( - ThreadPool&, - const int N, - const float 
alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ) - { - saxpy(N, alpha, X, incX, Y, incY); - } - -#if defined (USE_SSE3) - inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d) - { - const __m128 t0 = _mm_hadd_ps(a, b); - const __m128 t1 = _mm_hadd_ps(c, d); - return _mm_hadd_ps(t0, t1); - } -#endif - -#if defined (USE_SSE2) - - inline void transpose4x4_sse2( - const float* SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { - __m128 row1 = _mm_loadu_ps(&A[0 * lda]); - __m128 row2 = _mm_loadu_ps(&A[1 * lda]); - __m128 row3 = _mm_loadu_ps(&A[2 * lda]); - __m128 row4 = _mm_loadu_ps(&A[3 * lda]); - - _MM_TRANSPOSE4_PS(row1, row2, row3, row4); - - _mm_storeu_ps(&B[0 * ldb], row1); - _mm_storeu_ps(&B[1 * ldb], row2); - _mm_storeu_ps(&B[2 * ldb], row3); - _mm_storeu_ps(&B[3 * ldb], row4); - } - - void transpose_sse2( - const int N, const int M, - const float* SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { - static constexpr int block_size = 16; - - for (int n = 0; n < N; n += block_size) - { - for (int m = 0; m < M; m += block_size) - { - const int max_n2 = n + block_size < N ? n + block_size : N; - const int max_m2 = m + block_size < M ? m + block_size : M; - - int n2 = n; - for (; n2 < max_n2 - 3; n2 += 4) - { - int m2 = m; - for (; m2 < max_m2 - 3; m2 += 4) - { - transpose4x4_sse2( - &A[n2 * lda + m2], lda, - &B[m2 * ldb + n2], ldb - ); - } - - for (; m2 < max_m2; ++m2) - { - B[m2 * ldb + n2 + 0] = A[(n2 + 0) * lda + m2]; - B[m2 * ldb + n2 + 1] = A[(n2 + 1) * lda + m2]; - B[m2 * ldb + n2 + 2] = A[(n2 + 2) * lda + m2]; - B[m2 * ldb + n2 + 3] = A[(n2 + 3) * lda + m2]; - } - } - - for (; n2 < max_n2; ++n2) - { - for (int m2 = m; m2 < max_m2; ++m2) - { - B[m2 * ldb + n2] = A[n2 * lda + m2]; - } - } - } - } - } -#endif - - void transpose( - const int N, const int M, - const float * SF_BLAS_RESTRICT A, const int lda, - float* SF_BLAS_RESTRICT B, const int ldb - ) - { -#if defined (USE_SSE2) - - transpose_sse2( - N, M, - A, lda, - B, ldb - ); - -#else - - for(int r = 0; r < N; ++r) - { - for (int c = 0; c < M; ++c) - { - B[c*ldb + r] = A[r*lda + c]; - } - } - -#endif - } - - void sgemm_row_major_transpose_right( - ThreadPool& thread_pool, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - -#if defined(USE_SSE3) - - const __m128 alpha4 = _mm_set1_ps(alpha); - const __m128 beta4 = _mm_set1_ps(beta); - - std::atomic m_atomic = 0; - thread_pool.execute_with_workers( - [ - M, N, K, - alpha, alpha4, - A, lda, - B, ldb, - beta, beta4, - C, ldc, - &m_atomic - ](Thread&) { - for (;;) - { - const int m = m_atomic.fetch_add(2); - if (m >= M - 1) - break; - - int n = 0; - for (; n < N - 3; n += 4) - { - // mn - __m128 sum00 = _mm_setzero_ps(); - __m128 sum01 = _mm_setzero_ps(); - __m128 sum02 = _mm_setzero_ps(); - __m128 sum03 = _mm_setzero_ps(); - __m128 sum10 = _mm_setzero_ps(); - __m128 sum11 = _mm_setzero_ps(); - __m128 sum12 = _mm_setzero_ps(); - __m128 sum13 = _mm_setzero_ps(); - - // Horizontal sum of elements in sum[m][n] corresponds to - // the final element in the C. 
- - int k = 0; - for (; k < K - 3; k += 4) - { - const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]); - const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]); - - const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]); - const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]); - const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]); - const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]); - - sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0)); - sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1)); - sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2)); - sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3)); - sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0)); - sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1)); - sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2)); - sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3)); - } - - for(; k < K; k += 1) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - const float b1 = B[(n+1)*ldb+k+0]; - const float b2 = B[(n+2)*ldb+k+0]; - const float b3 = B[(n+3)*ldb+k+0]; - - // Since all will be summed vertically anyway we can - // just add to the first element. - // Other elements are left unmodified. - sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0)); - sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1)); - sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2)); - sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3)); - sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0)); - sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1)); - sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2)); - sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3)); - } - - __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03); - __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13); - s0 = _mm_mul_ps(s0, alpha4); - s1 = _mm_mul_ps(s1, alpha4); - - __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]); - __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]); - c0 = _mm_mul_ps(c0, beta4); - c1 = _mm_mul_ps(c1, beta4); - - c0 = _mm_add_ps(c0, s0); - c1 = _mm_add_ps(c1, s1); - - _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0); - _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1); - } - - for(; n < N; n += 1) - { - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int k = 0; k < K; ++k) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - - sum0 += a0 * b0; - sum1 += a1 * b0; - } - - C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha; - C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha; - } - } - } - ); - - int m = M - (M % 2); - for (; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - - thread_pool.wait_for_workers_finished(); - -#else - - thread_pool.for_each_index_with_workers( - 0, M, - [&](Thread&, int m) { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - ); - thread_pool.wait_for_workers_finished(); - -#endif - } - - void sgemm_row_major_transpose_right( - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - -#if defined(USE_SSE3) - - const __m128 alpha4 = _mm_set1_ps(alpha); - const __m128 beta4 = _mm_set1_ps(beta); - - int m = 0; - for (; m < M - 1; m += 2) - { - int n = 0; - for 
(; n < N - 3; n += 4) - { - // mn - __m128 sum00 = _mm_setzero_ps(); - __m128 sum01 = _mm_setzero_ps(); - __m128 sum02 = _mm_setzero_ps(); - __m128 sum03 = _mm_setzero_ps(); - __m128 sum10 = _mm_setzero_ps(); - __m128 sum11 = _mm_setzero_ps(); - __m128 sum12 = _mm_setzero_ps(); - __m128 sum13 = _mm_setzero_ps(); - - // Horizontal sum of elements in sum[m][n] corresponds to - // the final element in the C. - - int k = 0; - for (; k < K - 3; k += 4) - { - const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]); - const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]); - - const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]); - const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]); - const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]); - const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]); - - sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0)); - sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1)); - sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2)); - sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3)); - sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0)); - sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1)); - sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2)); - sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3)); - } - - for(; k < K; k += 1) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - const float b1 = B[(n+1)*ldb+k+0]; - const float b2 = B[(n+2)*ldb+k+0]; - const float b3 = B[(n+3)*ldb+k+0]; - - // Since all will be summed vertically anyway we can - // just add to the first element. - // Other elements are left unmodified. - sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0)); - sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1)); - sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2)); - sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3)); - sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0)); - sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1)); - sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2)); - sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3)); - } - - __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03); - __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13); - s0 = _mm_mul_ps(s0, alpha4); - s1 = _mm_mul_ps(s1, alpha4); - - __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]); - __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]); - c0 = _mm_mul_ps(c0, beta4); - c1 = _mm_mul_ps(c1, beta4); - - c0 = _mm_add_ps(c0, s0); - c1 = _mm_add_ps(c1, s1); - - _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0); - _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1); - } - - for(; n < N; n += 1) - { - float sum0 = 0.0f; - float sum1 = 0.0f; - - for (int k = 0; k < K; ++k) - { - const float a0 = A[(m+0)*lda+k+0]; - const float a1 = A[(m+1)*lda+k+0]; - - const float b0 = B[(n+0)*ldb+k+0]; - - sum0 += a0 * b0; - sum1 += a1 * b0; - } - - C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha; - C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha; - } - } - - for (; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - -#else - - for (int m = 0; m < M; m += 1) - { - for (int n = 0; n < N; n += 1) - { - float sum = 0.0f; - - for (int k = 0; k < K; k += 1) - { - sum += A[m*lda + k] * B[n*ldb + k]; - } - - C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha; - } - } - -#endif - } - - // The pointer to the storage returned by this function - // is valid until the next call to this function from - // the same thread with the same idx. 
- // This is an unsafe function and should be used with caution - // and only within this translation unit. - // The number of buffers available is just enough to make - // all functions here work. - float* get_thread_local_temporary_storage( - int requested_size, int idx - ) - { - static constexpr int MAX_NUM_BUFFERS = 2; - - static thread_local int s_data_size[MAX_NUM_BUFFERS] = {0}; - static thread_local std::unique_ptr s_data[MAX_NUM_BUFFERS]; - - if (requested_size > s_data_size[idx]) - { - s_data[idx] = std::make_unique(requested_size); - s_data_size[idx] = requested_size; - } - - return s_data[idx].get(); - } - - void sgemm_row_major_transpose_none( - ThreadPool& thread_pool, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 1; - - auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index); - - transpose( - K, N, - B, ldb, - B_tr, K - ); - - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A, lda, - B_tr, K, - beta, - C, ldc - ); - } - - void sgemm_row_major_transpose_none( - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 1; - - auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index); - - transpose( - K, N, - B, ldb, - B_tr, K - ); - - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A, lda, - B_tr, K, - beta, - C, ldc - ); - } - - void sgemm_row_major( - ThreadPool& thread_pool, - MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int temporary_buffer_index = 0; - - if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans) - { - sgemm_row_major_transpose_right( - thread_pool, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_none( - thread_pool, - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else // no transpositions - { - sgemm_row_major_transpose_none( - thread_pool, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - } - - void sgemm_row_major( - MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - constexpr static int 
temporary_buffer_index = 0; - - if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans) - { - sgemm_row_major_transpose_right( - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans) - { - auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index); - - transpose( - K, M, - A, lda, - A_tr, K - ); - - sgemm_row_major_transpose_none( - M, N, K, - alpha, - A_tr, K, - B, ldb, - beta, - C, ldc - ); - } - else // no transpositions - { - sgemm_row_major_transpose_none( - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - } - - void sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - if (layout == MatrixLayout::RowMajor) - { - sgemm_row_major( - thread_pool, - TransA, TransB, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else - { - sgemm_row_major( - thread_pool, - TransB, TransA, - N, M, K, - alpha, - B, ldb, - A, lda, - beta, - C, ldc - ); - } - } - - - void sgemm( - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ) - { - if (layout == MatrixLayout::RowMajor) - { - sgemm_row_major( - TransA, TransB, - M, N, K, - alpha, - A, lda, - B, ldb, - beta, - C, ldc - ); - } - else - { - sgemm_row_major( - TransB, TransA, - N, M, K, - alpha, - B, ldb, - A, lda, - beta, - C, ldc - ); - } - } - - std::vector generate_random_matrix(int rows, int cols) - { - std::vector m(rows * cols); - - std::mt19937_64 rng; - std::uniform_real_distribution d(-1.0, 1.0); - - for(auto& v : m) - { - v = d(rng); - } - - return m; - } - - std::vector generate_zero_matrix(int rows, int cols) - { - return std::vector(rows * cols, 0.0f); - } - - float matrix_relative_error( - const std::vector& ref, - const std::vector& our - ) - { - double sum = 0.0; - double diff_sum = 0.0; - - for(size_t i = 0; i < ref.size(); ++i) - { - sum += std::abs(ref[i]); - diff_sum += std::abs(ref[i] - our[i]); - } - - return diff_sum / sum; - } - - float norm( - const std::vector& v - ) - { - double sum = 0.0; - - for(auto& e : v) - { - sum += e * e; - } - - return std::sqrt(sum); - } - -#if defined (USE_BLAS) - - CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout) - { - if (layout == MatrixLayout::RowMajor) - return CblasRowMajor; - else if (layout == MatrixLayout::ColMajor) - return CblasColMajor; - - return static_cast(-1); - } - - const char* matrix_layout_to_string(MatrixLayout layout) - { - if (layout == MatrixLayout::RowMajor) - return "RowMajor"; - else if (layout == MatrixLayout::ColMajor) - return "ColMajor"; - - return "INVALID"; - } - - CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr) - { - if (tr == 
MatrixTranspose::NoTrans) - return CblasNoTrans; - else if (tr == MatrixTranspose::Trans) - return CblasTrans; - - return static_cast(-1); - } - - const char* matrix_transpose_to_string(MatrixTranspose tr) - { - if (tr == MatrixTranspose::NoTrans) - return "NoTrans"; - else if (tr == MatrixTranspose::Trans) - return "Trans"; - - return "INVALID"; - } - - void test_sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB, - int M, int N, int K - ) - { - auto A = generate_random_matrix(M * 2, K * 2); - auto B = generate_random_matrix(K * 2, N * 2); - auto C_ref = generate_random_matrix(M * 2, N * 2); - auto C_our = C_ref; - - std::cout - << matrix_layout_to_string(layout) << ' ' - << matrix_transpose_to_string(trA) << ' ' - << matrix_transpose_to_string(trB) << '\n'; - - std::cout << "A norm: " << norm(A) << '\n'; - std::cout << "B norm: " << norm(B) << '\n'; - std::cout << "C norm: " << norm(C_ref) << '\n'; - - const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2; - const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2; - const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2; - - cblas_sgemm( - matrix_layout_to_blas_layout(layout), - matrix_transpose_to_blas_transpose(trA), - matrix_transpose_to_blas_transpose(trB), - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - 1.0, - C_ref.data(), ldc - ); - - sgemm( - thread_pool, - layout, trA, trB, - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - 1.0, - C_our.data(), ldc - ); - - std::cout << "C_ref norm: " << norm(C_ref) << '\n'; - std::cout << "C_our norm: " << norm(C_our) << '\n'; - std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n'; - - std::cout << '\n'; - } - - void test_sgemm( - ThreadPool& thread_pool - ) - { - constexpr int M = 57; - constexpr int N = 127; - constexpr int K = 31; - - std::cout << "SGEMM test:\n"; - - for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor }) - { - for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - test_sgemm( - thread_pool, - layout, trA, trB, - M, N, K - ); - } - } - } - } - - void bench_sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB, - int M, int N, int K - ) - { - constexpr int num_iters = 1000; - - auto A = generate_random_matrix(M * 2, K * 2); - auto B = generate_random_matrix(K * 2, N * 2); - auto C_ref = generate_random_matrix(M * 2, N * 2); - auto C_our = C_ref; - - std::cout - << matrix_layout_to_string(layout) << ' ' - << matrix_transpose_to_string(trA) << ' ' - << matrix_transpose_to_string(trB) << '\n'; - - std::cout << "A norm: " << norm(A) << '\n'; - std::cout << "B norm: " << norm(B) << '\n'; - std::cout << "C norm: " << norm(C_ref) << '\n'; - - const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2; - const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2; - const int ldc = (layout == MatrixLayout::RowMajor) ? 
N * 2 : M * 2; - - auto t0_ref = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < num_iters; ++i) - { - cblas_sgemm( - matrix_layout_to_blas_layout(layout), - matrix_transpose_to_blas_transpose(trA), - matrix_transpose_to_blas_transpose(trB), - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - -0.5, - C_ref.data(), ldc - ); - } - auto t1_ref = std::chrono::high_resolution_clock::now(); - auto diff_ref = t1_ref - t0_ref; - - auto t0_our = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < num_iters; ++i) - { - sgemm( - thread_pool, - layout, trA, trB, - M, N, K, - 1.0, - A.data(), lda, - B.data(), ldb, - -0.5, - C_our.data(), ldc - ); - } - auto t1_our = std::chrono::high_resolution_clock::now(); - auto diff_our = t1_our - t0_our; - - std::cout << "C_ref norm: " << norm(C_ref) << '\n'; - std::cout << "C_our norm: " << norm(C_our) << '\n'; - std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n'; - std::cout << "Ref time: " << std::chrono::duration_cast(diff_ref).count() << " [ns]\n"; - std::cout << "Our time: " << std::chrono::duration_cast(diff_our).count() << " [ns]\n"; - - std::cout << '\n'; - } - - void bench_sgemm( - ThreadPool& thread_pool - ) - { - constexpr int M = 107; - constexpr int N = 213; - constexpr int K = 57; - - std::cout << "SGEMM benchmark:\n"; - - for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor }) - { - for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans }) - { - bench_sgemm( - thread_pool, - layout, trA, trB, - M, N, K - ); - } - } - } - } - -#endif - - void print_arch() - { -#if defined (USE_SSE3) - std::cout << "Using the sse3 implementation.\n"; -#elif defined (USE_SSE2) - std::cout << "Using the sse2 implementation.\n"; -#else - std::cout << "Using the base implementation.\n"; -#endif - } - - void test( - ThreadPool& thread_pool - ) - { -#if defined (USE_BLAS) - print_arch(); - test_sgemm(thread_pool); -#else - std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n"; - (void)thread_pool; -#endif - } - - void bench( - ThreadPool& thread_pool - ) - { -#if defined (USE_BLAS) - print_arch(); - bench_sgemm(thread_pool); -#else - std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n"; - (void)thread_pool; -#endif - } -} \ No newline at end of file diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h deleted file mode 100644 index f551bbf2..00000000 --- a/src/extra/stockfish_blas.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _STOCKFISH_BLAS_H_ -#define _STOCKFISH_BLAS_H_ - -struct ThreadPool; - -#if defined (_MSC_VER) -#define SF_BLAS_RESTRICT __restrict -#elif defined (__INTEL_COMPILER) -#define SF_BLAS_RESTRICT restrict -#elif defined (__clang__) -#define SF_BLAS_RESTRICT __restrict__ -#elif defined (__GNUC__) -#define SF_BLAS_RESTRICT __restrict__ -#endif - -namespace Blas { - - enum struct MatrixLayout { - RowMajor = 101, - ColMajor = 102 - }; - - enum struct MatrixTranspose { - NoTrans = 111, - Trans = 112 - }; - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void scopy( - const int N, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void scopy( - ThreadPool& thread_pool, - const int N, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void scopy( - ThreadPool& thread_pool, - const int N, - const float * SF_BLAS_RESTRICT X, 
const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ); - - void sscal( - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ); - - void sscal( - ThreadPool& thread_pool, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X - ); - - void sscal( - ThreadPool& thread_pool, - const int N, - const float alpha, - float * SF_BLAS_RESTRICT X, const int incX - ); - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void saxpy( - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void saxpy( - ThreadPool& thread_pool, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, - float * SF_BLAS_RESTRICT Y - ); - - void saxpy( - ThreadPool& thread_pool, - const int N, - const float alpha, - const float * SF_BLAS_RESTRICT X, const int incX, - float * SF_BLAS_RESTRICT Y, const int incY - ); - - void sgemm( - ThreadPool& thread_pool, - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ); - - void sgemm( - MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB, - const int M, const int N, const int K, - const float alpha, - const float * SF_BLAS_RESTRICT A, const int lda, - const float * SF_BLAS_RESTRICT B, const int ldb, - const float beta, - float * SF_BLAS_RESTRICT C, const int ldc - ); - - void test( - ThreadPool& thread_pool - ); - - void bench( - ThreadPool& thread_pool - ); -} - -#endif diff --git a/src/learn/autograd.h b/src/learn/autograd.h deleted file mode 100644 index 7b2853df..00000000 --- a/src/learn/autograd.h +++ /dev/null @@ -1,667 +0,0 @@ -#ifndef LEARNER_AUTOGRAD_H -#define LEARNER_AUTOGRAD_H - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Learner -{ - template - struct ValueWithGrad - { - T value; - T grad; - - ValueWithGrad& operator+=(const ValueWithGrad& rhs) - { - value += rhs.value; - grad += rhs.grad; - return *this; - } - - ValueWithGrad& operator-=(const ValueWithGrad& rhs) - { - value -= rhs.value; - grad -= rhs.grad; - return *this; - } - - ValueWithGrad& operator*=(T rhs) - { - value *= rhs; - grad *= rhs; - return *this; - } - - ValueWithGrad& operator/=(T rhs) - { - value /= rhs; - grad /= rhs; - return *this; - } - - [[nodiscard]] ValueWithGrad abs() const - { - return { std::abs(value), std::abs(grad) }; - } - - [[nodiscard]] ValueWithGrad clamp_grad(T max) const - { - return { value, std::clamp(grad, -max, max) }; - } - }; -} - -namespace Learner::Autograd::UnivariateStatic -{ - - template - struct Identity - { - using type = T; - }; - - template - using Id = typename Identity::type; - - template - using StoreValueOrRef = std::conditional_t< - std::is_rvalue_reference_v, - std::remove_reference_t, - const std::remove_reference_t& - >; - - namespace Detail - { - using CallIdType = std::uint32_t; - - struct CallId - { - CallIdType call_id{}; - - constexpr CallId() : - call_id(0) - { - } - - constexpr CallId(CallIdType id) : - call_id(id) - { - } - - [[nodiscard]] bool operator==(CallId rhs) const noexcept - { - return call_id == rhs.call_id; - } - - 
[[nodiscard]] bool operator!=(CallId rhs) const noexcept - { - return call_id != rhs.call_id; - } - }; - - [[nodiscard]] inline CallId next_call_id() - { - static thread_local CallIdType s_call_id = 0; - return CallId{ s_call_id++ }; - } - - template - struct TupleContains; - - template - struct TupleContains> : std::disjunction...> {}; - - template - constexpr bool TupleContainsV = TupleContains::value; - - template - constexpr bool AreAllConstantV = (std::remove_reference_t::is_constant && ...); - } - - template - struct Evaluable - { - constexpr Evaluable() = default; - - // We append a unique call id so that we can invalidate the cache when - // the next computation starts. A single evaluation should see - // the same call_id at every node. - template - [[nodiscard]] auto eval(const std::tuple& args) const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return ValueWithGrad{ value(new_args), grad(new_args) }; - } - - template >>> - [[nodiscard]] auto value(const std::tuple& args) const - { - const ChildT* this_ = static_cast(this); - - const auto call_id = std::get(args); - if (!value_cache.has_value() || value_cache_call_id != call_id) - { - value_cache_call_id = call_id; - value_cache = this_->calculate_value(args); - } - - return *value_cache; - } - - template >>> - [[nodiscard]] auto value(const std::tuple& args, ...) const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return value(new_args); - } - - template >>> - [[nodiscard]] auto grad(const std::tuple& args) const - { - if constexpr (ChildT::is_constant) - { - return T(0.0); - } - else - { - const ChildT* this_ = static_cast(this); - - const auto call_id = std::get(args); - if (!grad_cache.has_value() || grad_cache_call_id != call_id) - { - grad_cache_call_id = call_id; - grad_cache = this_->calculate_grad(args); - } - - return *grad_cache; - } - } - - template >>> - [[nodiscard]] auto grad(const std::tuple& args, ...) 
const - { - const auto call_id = Detail::next_call_id(); - const auto new_args = std::tuple_cat(args, std::tuple(call_id)); - return grad(new_args); - } - - private: - mutable std::optional value_cache; - mutable std::optional grad_cache; - mutable Detail::CallId value_cache_call_id{}; - mutable Detail::CallId grad_cache_call_id{}; - }; - - template - struct VariableParameter : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = false; - - constexpr VariableParameter() - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::get(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(1.0); - } - }; - - template - struct ConstantParameter : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr ConstantParameter() - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::get(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - }; - - template - struct Constant : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr Constant(T x) : - m_x(std::move(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple&) const - { - return m_x; - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - - private: - T m_x; - }; - - // The "constant" may change between executions, but is assumed to be - // constant during a single evaluation. - template - struct ConstantRef : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = true; - - constexpr ConstantRef(const T& x) : - m_x(x) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple&) const - { - return m_x; - } - - template - [[nodiscard]] T calculate_grad(const std::tuple&) const - { - return T(0.0); - } - - private: - const T& m_x; - }; - - template ::ValueType> - struct Sum : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Sum(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) + m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) + m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs) - { - return Sum(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id rhs) - { - return Sum&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator+(Id lhs, RhsT&& rhs) - { - return Sum&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Difference : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Difference(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) - m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) - 
m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs) - { - return Difference(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id rhs) - { - return Difference&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(Id lhs, RhsT&& rhs) - { - return Difference&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Product : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Product(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) * m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs) - { - return Product(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id rhs) - { - return Product&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator*(Id lhs, RhsT&& rhs) - { - return Product&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Quotient : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr Quotient(LhsT&& lhs, RhsT&& rhs) : - m_lhs(std::forward(lhs)), - m_rhs(std::forward(rhs)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return m_lhs.value(args) / m_rhs.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - auto g = m_rhs.value(args); - return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g); - } - - private: - StoreValueOrRef m_lhs; - StoreValueOrRef m_rhs; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs) - { - return Quotient(std::forward(lhs), std::forward(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id rhs) - { - return Quotient&&>(std::forward(lhs), Constant(rhs)); - } - - template ::ValueType> - [[nodiscard]] constexpr auto operator/(Id lhs, RhsT&& rhs) - { - return Quotient&&, RhsT&&>(Constant(lhs), std::forward(rhs)); - } - - template ::ValueType> - struct Negation : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Negation(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return -m_x.value(args); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return -m_x.grad(args); - } - - private: - StoreValueOrRef m_x; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto operator-(ArgT&& x) - { - return Negation(std::forward(x)); - } - - template ::ValueType> - struct Sigmoid : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr 
explicit Sigmoid(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return value_(m_x.value(args)); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_x.grad(args) * grad_(m_x.value(args)); - } - - private: - StoreValueOrRef m_x; - - [[nodiscard]] T value_(T x) const - { - return 1.0 / (1.0 + std::exp(-x)); - } - - [[nodiscard]] T grad_(T x) const - { - return value_(x) * (1.0 - value_(x)); - } - }; - - template ::ValueType> - [[nodiscard]] constexpr auto sigmoid(ArgT&& x) - { - return Sigmoid(std::forward(x)); - } - - template ::ValueType> - struct Pow : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Pow(ArgT&& x, Id exponent) : - m_x(std::forward(x)), - m_exponent(std::move(exponent)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return std::pow(m_x.value(args), m_exponent); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args); - } - - private: - StoreValueOrRef m_x; - T m_exponent; - }; - - template ::ValueType> - [[nodiscard]] constexpr auto pow(ArgT&& x, Id exp) - { - return Pow(std::forward(x), std::move(exp)); - } - - template ::ValueType> - struct Log : Evaluable> - { - using ValueType = T; - - static constexpr bool is_constant = Detail::AreAllConstantV; - - constexpr explicit Log(ArgT&& x) : - m_x(std::forward(x)) - { - } - - template - [[nodiscard]] T calculate_value(const std::tuple& args) const - { - return value_(m_x.value(args)); - } - - template - [[nodiscard]] T calculate_grad(const std::tuple& args) const - { - return m_x.grad(args) * grad_(m_x.value(args)); - } - - private: - StoreValueOrRef m_x; - - T value_(T x) const - { - return std::log(x); - } - - T grad_(T x) const - { - return 1.0 / x; - } - }; - - template ::ValueType> - [[nodiscard]] constexpr auto log(ArgT&& x) - { - return Log(std::forward(x)); - } - -} - -#endif \ No newline at end of file diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp index b28afa13..e5ddd6aa 100644 --- a/src/learn/gensfen.cpp +++ b/src/learn/gensfen.cpp @@ -13,7 +13,6 @@ #include "extra/nnue_data_binpack_format.h" #include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" #include "syzygy/tbprobe.h" @@ -493,8 +492,8 @@ namespace Learner // has it reached the max length or is a draw by fifty-move rule // or by 3-fold repetition - if (ply >= params.write_maxply - || pos.is_fifty_move_draw() + if (ply >= params.write_maxply + || pos.is_fifty_move_draw() || pos.is_three_fold_repetition()) { return 0; diff --git a/src/learn/gensfen_nonpv.cpp b/src/learn/gensfen_nonpv.cpp index ca365034..098511fe 100644 --- a/src/learn/gensfen_nonpv.cpp +++ b/src/learn/gensfen_nonpv.cpp @@ -13,7 +13,6 @@ #include "extra/nnue_data_binpack_format.h" #include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" #include "syzygy/tbprobe.h" diff --git a/src/learn/half_float.h b/src/learn/half_float.h deleted file mode 100644 index 5808a786..00000000 --- a/src/learn/half_float.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef __HALF_FLOAT_H__ -#define __HALF_FLOAT_H__ - -// Half Float Library by yaneurao -// (16-bit float) - -// Floating point operation by 16bit type -// Assume that the float type code generated by the compiler is in IEEE 754 format and use it. 
- -#include "types.h" - -namespace HalfFloat -{ - // IEEE 754 float 32 format is : - // sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits - // - // Our float16 format is : - // sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits - union float32_converter - { - int32_t n; - float f; - }; - - - // 16-bit float - struct float16 - { - // --- constructors - - float16() {} - float16(int16_t n) { from_float((float)n); } - float16(int32_t n) { from_float((float)n); } - float16(float n) { from_float(n); } - float16(double n) { from_float((float)n); } - - // build from a float - void from_float(float f) { *this = to_float16(f); } - - // --- implicit converters - - operator int32_t() const { return (int32_t)to_float(*this); } - operator float() const { return to_float(*this); } - operator double() const { return double(to_float(*this)); } - - // --- operators - - float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; } - float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; } - float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; } - float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; } - float16 operator + (float16 rhs) const { return float16(*this) += rhs; } - float16 operator - (float16 rhs) const { return float16(*this) -= rhs; } - float16 operator * (float16 rhs) const { return float16(*this) *= rhs; } - float16 operator / (float16 rhs) const { return float16(*this) /= rhs; } - float16 operator - () const { return float16(-to_float(*this)); } - bool operator == (float16 rhs) const { return this->v_ == rhs.v_; } - bool operator != (float16 rhs) const { return !(*this == rhs); } - - static void UnitTest() { unit_test(); } - - private: - - // --- entity - - uint16_t v_; - - // --- conversion between float and float16 - - static float16 to_float16(float f) - { - float32_converter c; - c.f = f; - u32 n = c.n; - - // The sign bit is MSB in common. - uint16_t sign_bit = (n >> 16) & 0x8000; - - // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit. - uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10; - - // The fraction is limited to 10-bit. - uint16_t fraction = (n >> (23-10)) & 0x3ff; - - float16 f_; - f_.v_ = sign_bit | exponent | fraction; - - return f_; - } - - static float to_float(float16 v) - { - u32 sign_bit = (v.v_ & 0x8000) << 16; - u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23; - u32 fraction = (v.v_ & 0x3ff) << (23 - 10); - - float32_converter c; - c.n = sign_bit | exponent | fraction; - return c.f; - } - - // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe). 
- static void unit_test() - { - float16 a, b, c, d; - a = 1; - std::cout << (float)a << std::endl; - b = -118.625; - std::cout << (float)b << std::endl; - c = 2.5; - std::cout << (float)c << std::endl; - d = a + c; - std::cout << (float)d << std::endl; - - c *= 1.5; - std::cout << (float)c << std::endl; - - b /= 3; - std::cout << (float)b << std::endl; - - float f1 = 1.5; - a += f1; - std::cout << (float)a << std::endl; - - a += f1 * (float)a; - std::cout << (float)a << std::endl; - } - - }; - -} - -#endif // __HALF_FLOAT_H__ diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp deleted file mode 100644 index 9c4546a6..00000000 --- a/src/learn/learn.cpp +++ /dev/null @@ -1,1474 +0,0 @@ -// Learning routines: -// -// 1) Automatic generation of game records in .bin format -// → "gensfen" command -// -// 2) Learning evaluation function parameters from the generated .bin files -// → "learn" command -// -// → Shuffle in the teacher phase is also an extension of this command. -// Example) "learn shuffle" -// -// 3) Automatic generation of fixed traces -// → "makebook think" command -// → implemented in extra/book/book.cpp -// -// 4) Post-station automatic review mode -// → I will not be involved in the engine because it is a problem that the GUI should assist. -// etc.. - -#include "learn.h" - -#include "autograd.h" -#include "sfen_reader.h" - -#include "misc.h" -#include "position.h" -#include "thread.h" -#include "tt.h" -#include "uci.h" -#include "search.h" -#include "timeman.h" - -#include "nnue/evaluate_nnue.h" -#include "nnue/evaluate_nnue_learner.h" - -#include "syzygy/tbprobe.h" - -#include -#include -#include // std::exp(),std::pow(),std::log() -#include // memcpy() -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (_OPENMP) -#include -#endif - -using namespace std; - -template -T operator +=(std::atomic& x, const T rhs) -{ - T old = x.load(std::memory_order_consume); - - // It is allowed that the value is rewritten from other thread at this timing. - // The idea that the value is not destroyed is good. - T desired = old + rhs; - while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume)) - desired = old + rhs; - return desired; -} -template -T operator -= (std::atomic& x, const T rhs) { return x += -rhs; } - -namespace Learner -{ - static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0); - - // Score scale factors. ex) If we set src_score_min_value = 0.0, - // src_score_max_value = 1.0, dest_score_min_value = 0.0, - // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000]. - static double src_score_min_value = 0.0; - static double src_score_max_value = 1.0; - static double dest_score_min_value = 0.0; - static double dest_score_max_value = 1.0; - - // A constant used in elmo (WCSC27). Adjustment required. - // Since elmo does not internally divide the expression, the value is different. - // You can set this value with the learn command. 
- // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27) - static double elmo_lambda_low = 1.0; - static double elmo_lambda_high = 1.0; - static double elmo_lambda_limit = 32000; - - // Using stockfish's WDL with win rate model instead of sigmoid - static bool use_wdl = false; - - static void append_files_from_dir( - std::vector& filenames, - const std::string& base_dir, - const std::string& target_dir) - { - string kif_base_dir = Path::combine(base_dir, target_dir); - - sys::path p(kif_base_dir); // Origin of enumeration - std::for_each(sys::directory_iterator(p), sys::directory_iterator(), - [&](const sys::path& path) { - if (sys::is_regular_file(path)) - filenames.push_back(Path::combine(target_dir, path.filename().generic_string())); - }); - } - - static void rebase_files( - std::vector& filenames, - const std::string& base_dir) - { - for (auto& file : filenames) - { - file = Path::combine(base_dir, file); - } - } - - static double calculate_lambda(double teacher_signal) - { - // If the evaluation value in deep search exceeds elmo_lambda_limit - // then apply elmo_lambda_high instead of elmo_lambda_low. - const double lambda = - (std::abs(teacher_signal) >= elmo_lambda_limit) - ? elmo_lambda_high - : elmo_lambda_low; - - return lambda; - } - - // We use our own simple static autograd for automatic - // differentiation of the loss function. While it works it has it's caveats. - // To work fast enough it requires memoization and reference semantics. - // Memoization is mostly opaque to the user and is only per eval basis. - // As for reference semantics, we cannot copy every node, - // because we need a way to reuse computation. - // But we can't really use shared_ptr because of the overhead. That means - // that we have to ensure all parts of a loss expression are not destroyed - // before use. When lvalue references are used to construct a node it will - // store just a reference, it only perform a copy of the rvalue reference arguments. - // This means that we need some storage for the whole computation tree - // that keeps the values after function returns and never moves them to - // a different memory location. This means that we cannot use local - // variables and just return by value - because there may be dangling references left. - // We also cannot create a struct with this tree on demand because one cannot - // use `auto` as a struct members. This is a big issue, and the only way - // to solve it as of now is to use static thread_local variables and rely on the - // following assumptions: - // 1. the expression node must not change for the duration of the program - // within a single instance of a function. This is usually not a problem - // because almost all information is carried by the type. There is an - // exception though, we have ConstantRef and Constant nodes that - // do not encode the constants in the type, so it's possible - // that these nodes are different on the first call to the function - // then later. We MUST ensure that one function is only ever used - // for one specific expression. - // 2. thread_local variables are not expensive. Usually after creation - // it only requires a single unsynchronized boolean check and that's - // how most compilers implement it. - // - // So the general way to do things right now is to use static thread_local - // variables for all named autograd nodes. Results being nodes should be - // returned by reference, so that there's no need to copy the returned objects. 
- // Parameters being nodes should be taken by lvalue reference if they are - // used more than once (to enable reference semantics to reuse computation), - // but they can be rvalues and forward on first use if there's only one use - // of the node in the scope. - // We must keep in mind that the node tree created by such a function - // is never going to change as thread_local variables are initialized - // on first call. This means that one cannot use one function as a factory - // for different autograd expression trees. - - template - static auto& cross_entropy_( - ShallowT& q_, - TeacherT& p_, - ResultT& t_, - LambdaT& lambda_ - ) - { - using namespace Learner::Autograd::UnivariateStatic; - - constexpr double epsilon = 1e-12; - - static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon)); - static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon)); - static thread_local auto teacher_loss_ = -(p_ * log(q_ + epsilon) + (1.0 - p_) * log(1.0 - q_ + epsilon)); - static thread_local auto outcome_loss_ = -(t_ * log(q_ + epsilon) + (1.0 - t_) * log(1.0 - q_ + epsilon)); - static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_; - static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_; - static thread_local auto cross_entropy_ = result_ - entropy_; - - return cross_entropy_; - } - - template - static auto& scale_score_(ValueT&& v_) - { - using namespace Learner::Autograd::UnivariateStatic; - - // Normalize to [0.0, 1.0]. - static thread_local auto normalized_ = - (std::forward(v_) - ConstantRef(src_score_min_value)) - / (ConstantRef(src_score_max_value) - ConstantRef(src_score_min_value)); - - // Scale to [dest_score_min_value, dest_score_max_value]. - static thread_local auto scaled_ = - normalized_ - * (ConstantRef(dest_score_max_value) - ConstantRef(dest_score_min_value)) - + ConstantRef(dest_score_min_value); - - return scaled_; - } - - static Value scale_score(Value v) - { - // Normalize to [0.0, 1.0]. - auto normalized = - ((double)v - src_score_min_value) - / (src_score_max_value - src_score_min_value); - - // Scale to [dest_score_min_value, dest_score_max_value]. - auto scaled = - normalized - * (dest_score_max_value - dest_score_min_value) - + dest_score_min_value; - - return Value(scaled); - } - - template - static auto& expected_perf_(ValueT&& v_) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto perf_ = sigmoid(std::forward(v_) * ConstantRef(winning_probability_coefficient)); - - return perf_; - } - - template - static auto& expected_perf_use_wdl_( - ValueT& v_, - PlyT&& ply_ - ) - { - using namespace Learner::Autograd::UnivariateStatic; - - // Coefficients of a 3rd order polynomial fit based on fishtest data - // for two parameters needed to transform eval to the argument of a - // logistic function. 
- static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 }; - static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741, 72.05858751 }; - - // The model captures only up to 240 plies, so limit input (and rescale) - static thread_local auto m_ = std::forward(ply_) / 64.0; - - static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3]; - static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3]; - - // Return win rate in per mille - static thread_local auto sv_ = (v_ - a_) / b_; - static thread_local auto svn_ = (-v_ - a_) / b_; - - static thread_local auto win_pct_ = sigmoid(sv_); - static thread_local auto loss_pct_ = sigmoid(svn_); - - static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_; - - static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5; - - return perf_; - } - - static double expected_perf_use_wdl( - Value v, - int ply - ) - { - // Coefficients of a 3rd order polynomial fit based on fishtest data - // for two parameters needed to transform eval to the argument of a - // logistic function. - static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 }; - static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741, 72.05858751 }; - - // The model captures only up to 240 plies, so limit input (and rescale) - auto m = ply / 64.0; - - auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3]; - auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; - - // Return win rate in per mille - auto sv = ((double)v - a) / b; - auto svn = ((double)-v - a) / b; - - auto win_pct = Math::sigmoid(sv); - auto loss_pct = Math::sigmoid(svn); - - auto draw_pct = 1.0 - win_pct - loss_pct; - - auto perf = win_pct + draw_pct * 0.5; - - return perf; - } - - [[maybe_unused]] static ValueWithGrad get_loss_noob( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto q_ = VariableParameter{}; - static thread_local auto p_ = ConstantParameter{}; - static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0)); - - auto args = std::tuple( - (double)shallow, - (double)teacher_signal, - (double)result, - calculate_lambda(teacher_signal) - ); - - return loss_.eval(args); - } - - static auto& get_loss_cross_entropy_() - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& q_ = expected_perf_(VariableParameter{}); - static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter{})); - static thread_local auto t_ = (ConstantParameter{} + 1.0) * 0.5; - static thread_local auto lambda_ = ConstantParameter{}; - static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_); - - return loss_; - } - - static auto get_loss_cross_entropy_args( - Value shallow, Value teacher_signal, int result) - { - return std::tuple( - (double)shallow, - (double)teacher_signal, - (double)result, - calculate_lambda(teacher_signal) - ); - } - - static ValueWithGrad get_loss_cross_entropy( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_(); - - auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result); - - return loss_.eval(args); - } - - static ValueWithGrad get_loss_cross_entropy_no_grad( - Value shallow, Value teacher_signal, int result, int /* ply */) - { - using namespace 
Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_(); - - auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result); - - return { loss_.value(args), 0.0 }; - } - - static auto& get_loss_cross_entropy_use_wdl_() - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto ply_ = ConstantParameter{}; - static thread_local auto shallow_ = VariableParameter{}; - static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_); - // We could do just this but MSVC crashes with an internal compiler error :( - // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter{}); - // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_); - static thread_local auto p_ = ConstantParameter{}; - static thread_local auto t_ = (ConstantParameter{} + 1.0) * 0.5; - static thread_local auto lambda_ = ConstantParameter{}; - static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_); - - return loss_; - } - - static auto get_loss_cross_entropy_use_wdl_args( - Value shallow, Value teacher_signal, int result, int ply) - { - return std::tuple( - (double)shallow, - // This is required because otherwise MSVC crashes :( - expected_perf_use_wdl(scale_score(teacher_signal), ply), - (double)result, - calculate_lambda(teacher_signal), - (double)std::min(240, ply) - ); - } - - static ValueWithGrad get_loss_cross_entropy_use_wdl( - Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_(); - - auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply); - - return loss_.eval(args); - } - - static ValueWithGrad get_loss_cross_entropy_use_wdl_no_grad( - Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_(); - - auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply); - - return { loss_.value(args), 0.0 }; - } - - static auto get_loss(Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - if (use_wdl) - { - return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply); - } - else - { - return get_loss_cross_entropy(shallow, teacher_signal, result, ply); - } - } - - static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply) - { - using namespace Learner::Autograd::UnivariateStatic; - - if (use_wdl) - { - return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply); - } - else - { - return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply); - } - } - - [[maybe_unused]] static auto get_loss( - Value teacher_signal, - Value shallow, - const PackedSfenValue& psv) - { - return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly); - } - - static auto get_loss_no_grad( - Value teacher_signal, - Value shallow, - const PackedSfenValue& psv) - { - return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly); - } - - // Class to generate sfen with multiple threads - struct LearnerThink - { - struct Params - { - // Mini batch size size. Be sure to set it on the side that uses this class. 
- uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE; - - // Number of phases used for calculation such as mse - // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time. - // Since search() is performed with depth = 1 in calculation of - // move match rate, simple comparison is not possible... - uint64_t validation_count = 2000; - - // Option to exclude early stage from learning - int reduction_gameply = 1; - - // If the absolute value of the evaluation value of the deep search - // of the teacher phase exceeds this value, discard the teacher phase. - int eval_limit = 32000; - - // Flag whether to dig a folder each time the evaluation function is saved. - // If true, do not dig the folder. - bool save_only_once = false; - - bool shuffle = true; - - bool verbose = false; - - double newbob_decay = 0.5; - int newbob_num_trials = 4; - uint64_t auto_lr_drop = 0; - - std::string best_nn_directory; - - uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL; - uint64_t loss_output_interval = 1'000'000; - - size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE; - size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE; - - bool use_draw_games_in_training = true; - bool use_draw_games_in_validation = true; - bool skip_duplicated_positions_in_training = true; - - bool assume_quiet = false; - bool smart_fen_skipping = false; - bool smart_fen_skipping_for_validation = false; - - double learning_rate = 1.0; - double warmup_learning_rate = 0.1; - double max_grad = 1.0; - - string validation_set_file_name; - string seed; - - std::vector filenames; - - uint64_t num_threads; - - void enforce_constraints() - { - num_threads = Options["Threads"]; - - if (loss_output_interval == 0) - { - loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size; - } - - // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1. - reduction_gameply = max(reduction_gameply, 1); - - if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) { - // Save the current net to [EvalSaveDir]\original. - Eval::NNUE::save_eval("original"); - - // Set the folder above to best_nn_directory so that the trainer can - // resotre the network parameters from the original net file. - best_nn_directory = - Path::combine(Options["EvalSaveDir"], "original"); - } - } - }; - - LearnerThink(const Params& prm) : - params(prm), - init_prng(prm.seed), - train_sr( - prm.filenames, - prm.shuffle, - SfenReaderMode::Cyclic, - prm.num_threads, - std::to_string(init_prng.next_random_seed()), - prm.sfen_read_size, - prm.thread_buffer_size), - validation_sr( - prm.validation_set_file_name.empty() ? 
prm.filenames : std::vector{ prm.validation_set_file_name }, - prm.shuffle, - SfenReaderMode::Cyclic, - 1, - std::to_string(init_prng.next_random_seed()), - std::min(prm.validation_count * 10, 1000000), - prm.thread_buffer_size), - learn_loss_sum{} - { - save_count = 0; - loss_output_count = 0; - last_lr_drop = 0; - best_loss = std::numeric_limits::infinity(); - latest_loss_sum = 0.0; - latest_loss_count = 0; - total_done = 0; - trials = params.newbob_num_trials; - dir_number = 0; - - prngs.reserve(prm.num_threads); - for (uint64_t i = 0; i < prm.num_threads; ++i) - { - prngs.emplace_back(init_prng.next_random_seed()); - } - } - - void learn(uint64_t epochs, uint64_t warmup_epochs = 0); - - private: - static void set_learning_search_limits(); - - PSVector fetch_next_validation_set(); - - void learn_worker(Thread& th, std::atomic& counter, uint64_t limit); - - void update_weights(const PSVector& psv, uint64_t epoch); - void update_weights_warmup(uint64_t warmup_epoch); - - void calc_loss(const PSVector& psv, uint64_t epoch); - - void calc_loss_worker( - Thread& th, - std::atomic& counter, - const PSVector& psv, - Loss& test_loss_sum, - atomic& sum_norm, - atomic& move_accord_count, - atomic& sum_one_over_move_count - ); - - bool has_depth1_move_agreement(Position& pos, Move pvmove); - - bool check_progress(); - - // save merit function parameters to a file - bool save(bool is_final = false); - - Params params; - - PRNG init_prng; - std::vector prngs; - - // sfen reader - SfenReader train_sr; - SfenReader validation_sr; - - uint64_t save_count; - uint64_t loss_output_count; - - std::atomic stop_flag; - - uint64_t total_done; - - uint64_t last_lr_drop; - double best_loss; - double latest_loss_sum; - uint64_t latest_loss_count; - - int trials; - int dir_number; - - // For calculation of learning data loss - Loss learn_loss_sum; - }; - - void LearnerThink::set_learning_search_limits() - { - Threads.main()->ponder = false; - - // About Search::Limits - // Be careful because this member variable is global and affects other threads. - auto& limits = Search::Limits; - - limits.startTime = now(); - - // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done) - limits.infinite = true; - - // Since PV is an obstacle when displayed, erase it. - limits.silent = true; - - // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it. - limits.nodes = 0; - - // depth is also processed by the one passed as an argument of Learner::search(). - limits.depth = 0; - } - - PSVector LearnerThink::fetch_next_validation_set() - { - PSVector validation_data; - - auto mainThread = Threads.main(); - mainThread->execute_with_worker([&validation_data, this](auto& th){ - auto do_include_predicate = [&th, this](const PackedSfenValue& ps) -> bool { - if (params.eval_limit < abs(ps.score)) - return false; - - if (!params.use_draw_games_in_validation && ps.game_result == 0) - return false; - - if (params.smart_fen_skipping_for_validation) - { - StateInfo si; - auto& pos = th.rootPos; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - return false; - - if (pos.capture_or_promotion((Move)ps.move) || pos.checkers()) - return false; - } - - return true; - }; - - validation_data = validation_sr.read_some( - params.validation_count, - params.validation_count * 100, // to have a reasonable bound on the running time. 
- do_include_predicate - ); - }); - mainThread->wait_for_worker_finished(); - - return validation_data; - } - - void LearnerThink::learn(uint64_t epochs, uint64_t warmup_epochs) - { -#if defined(_OPENMP) - omp_set_num_threads((int)Options["Threads"]); -#endif - - set_learning_search_limits(); - - Eval::NNUE::verify_any_net_loaded(); - - const PSVector validation_data = fetch_next_validation_set(); - - if (validation_data.size() != params.validation_count) - { - auto out = sync_region_cout.new_region(); - out - << "INFO (learn): Error reading validation data. Read " << validation_data.size() - << " out of " << params.validation_count << '\n' - << "INFO (learn): This either means that less than 1% of the validation data passed the filter" - << " or the file is empty\n"; - - return; - } - - stop_flag = false; - - if (warmup_epochs > 0) - { - cout << "Doing " << warmup_epochs << " warmup epochs." << endl; - } - - for(uint64_t warmup_epoch = 1; warmup_epoch <= warmup_epochs; ++warmup_epoch) - { - std::atomic counter{0}; - - Threads.execute_with_workers([this, &counter](auto& th){ - learn_worker(th, counter, params.mini_batch_size); - }); - - total_done += params.mini_batch_size; - - Threads.wait_for_workers_finished(); - - if (stop_flag) - break; - - update_weights_warmup(warmup_epoch); - - if (stop_flag) - break; - - cout << "Finished " << warmup_epoch << " out of " << warmup_epochs << " warmup epochs." << endl; - } - - if (params.newbob_decay != 1.0) { - - calc_loss(validation_data, 0); - - best_loss = latest_loss_sum / latest_loss_count; - latest_loss_sum = 0.0; - latest_loss_count = 0; - - auto out = sync_region_cout.new_region(); - out << "INFO (learn): initial loss = " << best_loss << endl; - } - - for(uint64_t epoch = 1; epoch <= epochs; ++epoch) - { - std::atomic counter{0}; - - Threads.execute_with_workers([this, &counter](auto& th){ - learn_worker(th, counter, params.mini_batch_size); - }); - - total_done += params.mini_batch_size; - - Threads.wait_for_workers_finished(); - - if (stop_flag) - break; - - update_weights(validation_data, epoch); - - if (stop_flag) - break; - } - - Eval::NNUE::finalize_net(); - - save(true); - } - - void LearnerThink::learn_worker(Thread& th, std::atomic& counter, uint64_t limit) - { - const auto thread_id = th.thread_idx(); - auto& pos = th.rootPos; - auto& prng = prngs[th.thread_idx()]; - - std::vector> state(MAX_PLY); - - while(!stop_flag) - { - const auto iter = counter.fetch_add(1); - if (iter >= limit) - break; - - PackedSfenValue ps; - - RETRY_READ:; - - if (!train_sr.read_to_thread_buffer(thread_id, ps)) - { - // If we ran out of data we stop completely - // because there's nothing left to do. - stop_flag = true; - break; - } - - if (params.eval_limit < abs(ps.score)) - goto RETRY_READ; - - if (!params.use_draw_games_in_training && ps.game_result == 0) - goto RETRY_READ; - - // Skip over the opening phase - if (ps.gamePly < prng.rand(params.reduction_gameply)) - goto RETRY_READ; - - StateInfo si; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - { - // Malformed sfen - auto out = sync_region_cout.new_region(); - out << "ERROR: illigal packed sfen = " << pos.fen() << endl; - goto RETRY_READ; - } - - const auto rootColor = pos.side_to_move(); - - // A function that adds the current `pos` and `ps` - // to the training set. 
- auto pos_add_grad = [&]() { - - // Evaluation value of deep search - const Value shallow_value = Eval::evaluate(pos); - - Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0); - }; - - if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move)) - { - goto RETRY_READ; - } - - // We don't need to qsearch when doing smart skipping - if (!params.assume_quiet && !params.smart_fen_skipping) - { - int ply = 0; - pos.do_move((Move)ps.move, state[ply++]); - - // Evaluation value of shallow search (qsearch) - const auto [_, pv] = Search::qsearch(pos); - - for (auto m : pv) - { - pos.do_move(m, state[ply++]); - } - } - - if (params.smart_fen_skipping - && (pos.capture_or_promotion((Move)ps.move) - || pos.checkers())) - { - goto RETRY_READ; - } - - // We want to position being trained on not to be terminal - if (MoveList(pos).size() == 0) - goto RETRY_READ; - - // Since we have reached the end phase of PV, add the slope here. - pos_add_grad(); - } - } - - void LearnerThink::update_weights_warmup(uint64_t warmup_epoch) - { - // I'm not sure this fencing is correct. But either way there - // should be no real issues happening since - // the read/write phases are isolated. - atomic_thread_fence(memory_order_seq_cst); - Eval::NNUE::update_parameters( - Threads, warmup_epoch, params.verbose, params.warmup_learning_rate, params.max_grad, get_loss); - atomic_thread_fence(memory_order_seq_cst); - } - - void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch) - { - // I'm not sure this fencing is correct. But either way there - // should be no real issues happening since - // the read/write phases are isolated. - atomic_thread_fence(memory_order_seq_cst); - learn_loss_sum += Eval::NNUE::update_parameters( - Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss); - atomic_thread_fence(memory_order_seq_cst); - - if (++save_count * params.mini_batch_size >= params.eval_save_interval) - { - save_count = 0; - - const bool converged = save(); - if (converged) - { - stop_flag = true; - return; - } - } - - if (++loss_output_count * params.mini_batch_size >= params.loss_output_interval) - { - loss_output_count = 0; - - // loss calculation - calc_loss(psv, epoch); - - Eval::NNUE::check_health(); - } - } - - void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch) - { - TT.new_search(); - TimePoint elapsed = now() - Search::Limits.startTime + 1; - - auto out = sync_region_cout.new_region(); - - out << "\n"; - out << "PROGRESS (calc_loss): " << now_string() - << ", " << total_done << " sfens" - << ", " << total_done * 1000 / elapsed << " sfens/second" - << ", epoch " << epoch - << endl; - - out << " - learning rate = " << params.learning_rate << endl; - - // For calculation of verification data loss - Loss test_loss_sum{}; - - // norm for learning - atomic sum_norm{0.0}; - - // The number of times the pv first move of deep - // search matches the pv first move of search(1). - atomic move_accord_count{0}; - - // If there is 10 legal moves then 0.1 will be added. - // This happens for each position tested. 
- // Effectively at the end we have the random move accuracy - // multiplied by the number of positions, which is psv.size() - atomic sum_one_over_move_count{0.0}; - - auto mainThread = Threads.main(); - mainThread->execute_with_worker([&out](auto& th){ - auto& pos = th.rootPos; - StateInfo si; - pos.set(StartFEN, false, &si, &th); - out << " - startpos eval = " << Eval::evaluate(pos) << endl; - }); - mainThread->wait_for_worker_finished(); - - // The number of tasks to do. - atomic counter{0}; - Threads.execute_with_workers([&](auto& th){ - calc_loss_worker( - th, - counter, - psv, - test_loss_sum, - sum_norm, - move_accord_count, - sum_one_over_move_count - ); - }); - Threads.wait_for_workers_finished(); - - latest_loss_sum += test_loss_sum.value(); - latest_loss_count += psv.size(); - - if (psv.size() && test_loss_sum.count() > 0) - { - test_loss_sum.print_only_loss("val", out); - - if (learn_loss_sum.count() > 0) - { - learn_loss_sum.print_with_grad("train", out); - } - - out << " - norm = " << sum_norm << endl; - out << " - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl; - out << " - random move accuracy = " << (sum_one_over_move_count * 100.0 / psv.size()) << "%" << endl; - } - else - { - out << "ERROR: psv.size() = " << psv.size() << " , done = " << test_loss_sum.count() << endl; - } - - learn_loss_sum.reset(); - } - - void LearnerThink::calc_loss_worker( - Thread& th, - std::atomic& counter, - const PSVector& psv, - Loss& test_loss_sum, - atomic& sum_norm, - atomic& move_accord_count, - atomic& sum_one_over_move_count - ) - { - Loss local_loss_sum{}; - double local_sum_one_over_move_count = 0.0; - auto& pos = th.rootPos; - - for(;;) - { - const auto task_id = counter.fetch_add(1); - if (task_id >= psv.size()) - { - break; - } - - const auto& ps = psv[task_id]; - - StateInfo si; - if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0) - { - cout << "Error! : illegal packed sfen " << pos.fen() << endl; - continue; - } - - const Value shallow_value = Eval::evaluate(pos); - - // Evaluation value of deep search - const auto deep_value = (Value)ps.score; - - const auto loss = get_loss_no_grad( - deep_value, - shallow_value, - ps); - - local_loss_sum += loss; - sum_norm += (double)abs(shallow_value); - - // Threat all moves with equal scores as first. This is up to move ordering. - if (has_depth1_move_agreement(pos, (Move)ps.move)) - move_accord_count.fetch_add(1, std::memory_order_relaxed); - - local_sum_one_over_move_count += 1.0 / static_cast(MoveList(pos).size()); - } - - sum_one_over_move_count += local_sum_one_over_move_count; - test_loss_sum += local_loss_sum; - } - - bool LearnerThink::has_depth1_move_agreement(Position& pos, Move pvmove) - { - // Determine if the depth 1 search pv matches the move from the dataset. - // Do a manual depth 1 search so we're not affected by previous searches. - std::vector> child_scores; - - // Call evaluate once for the rootpos so that the evals - // for children moves use incremental feature transformer updates. - (void)Eval::evaluate(pos); - - // Just to get guaranteed alignment. - std::vector> states(1); - auto legal_moves = MoveList(pos); - for (auto m : legal_moves) - { - pos.do_move(m, states[0]); - // We don't care if the king is in check or stuff like that. - // not a big issue and nnue should digest all. 
- auto value = -Eval::evaluate(pos); - child_scores.emplace_back(m, value); - pos.undo_move(m); - } - - if (child_scores.empty()) - return false; - - std::sort( - child_scores.begin(), - child_scores.end(), - [](auto& lhs, auto& rhs) { return lhs.second > rhs.second; } - ); - - // Require the best move to have strictly higher score than the next one. - return - child_scores[0].first == pvmove - && (child_scores.size() == 1 - || child_scores[1].second != child_scores[0].second); - } - - bool LearnerThink::check_progress() - { - auto out = sync_region_cout.new_region(); - - const double latest_loss = latest_loss_sum / latest_loss_count; - bool converged = false; - latest_loss_sum = 0.0; - latest_loss_count = 0; - - auto drop_lr = [&]() { - last_lr_drop = total_done; - - out - << " - reducing learning rate from " << params.learning_rate - << " to " << (params.learning_rate * params.newbob_decay) - << " (" << trials << " more trials)" << endl; - - params.learning_rate *= params.newbob_decay; - }; - - auto accept = [&]() { - out << " - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl; - - best_loss = latest_loss; - trials = params.newbob_num_trials; - }; - - auto reject = [&]() { - out << " - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl; - - --trials; - if (trials > 0) - { - drop_lr(); - return false; - } - else - { - return true; - } - }; - - out << "INFO (learning_rate):" << endl; - - if (params.auto_lr_drop) - { - accept(); - - if (total_done >= last_lr_drop + params.auto_lr_drop) - { - drop_lr(); - } - } - else if (latest_loss < best_loss) - { - accept(); - } - else - { - converged = reject(); - } - - if (converged) - { - out << " - converged" << endl; - } - - return converged; - } - - // Write evaluation function file. - bool LearnerThink::save(bool is_final) - { - // Each time you save, change the extension part of the file name like "0","1","2",.. - // (Because I want to compare the winning rate for each evaluation function parameter later) - - bool converged = false; - - if (params.save_only_once) - { - // When EVAL_SAVE_ONLY_ONCE is defined, - // Do not dig a subfolder because I want to save it only once. - Eval::NNUE::save_eval(""); - } - else if (is_final) - { - Eval::NNUE::save_eval("final"); - converged = true; - } - else - { - // TODO: consider naming the output directory by epoch. - const std::string dir_name = std::to_string(dir_number++); - Eval::NNUE::save_eval(dir_name); - - if (params.newbob_decay != 1.0 && latest_loss_count > 0) - { - converged = check_progress(); - params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name); - } - } - - return converged; - } - - // Learning from the generated game record - void learn(istringstream& is) - { - LearnerThink::Params params; - - // Number of epochs - uint64_t epochs = std::numeric_limits::max(); - uint64_t warmup_epochs = 0; - - // Game file storage folder (get game file with relative path from here) - string base_dir; - string target_dir; - - uint64_t nn_batch_size = 1000; - string nn_options; - - auto out = sync_region_cout.new_region(); - - // Assume the filenames are staggered. - while (true) - { - string option; - is >> option; - - if (option == "") - break; - - // specify the number of phases of mini-batch - if (option == "bat") - { - is >> params.mini_batch_size; - params.mini_batch_size *= 10000; // Unit is ten thousand - } - - // Specify the folder in which the game record is stored and make it the rooting target. 
- else if (option == "targetdir") is >> target_dir; - else if (option == "targetfile") - { - std::string filename; - is >> filename; - params.filenames.push_back(filename); - } - else if (option == "validation_count") is >> params.validation_count; - - // Specify the number of loops - else if (option == "epochs") is >> epochs; - else if (option == "warmup_epochs") is >> warmup_epochs; - - // Game file storage folder (get game file with relative path from here) - else if (option == "basedir") is >> base_dir; - - // Mini batch size - else if (option == "batchsize" - || option == "epoch_size") - is >> params.mini_batch_size; - - // learning rate - else if (option == "lr") is >> params.learning_rate; - else if (option == "warmup_lr") is >> params.warmup_learning_rate; - else if (option == "max_grad") is >> params.max_grad; - - // Accept also the old option name. - else if (option == "use_draw_in_training" - || option == "use_draw_games_in_training") - is >> params.use_draw_games_in_training; - - // Accept also the old option name. - else if (option == "use_draw_in_validation" - || option == "use_draw_games_in_validation") - is >> params.use_draw_games_in_validation; - - // Accept also the old option name. - else if (option == "use_hash_in_training" - || option == "skip_duplicated_positions_in_training") - is >> params.skip_duplicated_positions_in_training; - - else if (option == "winning_probability_coefficient") - is >> winning_probability_coefficient; - - // Using WDL with win rate model instead of sigmoid - else if (option == "use_wdl") is >> use_wdl; - - - // LAMBDA - else if (option == "lambda") is >> elmo_lambda_low; - else if (option == "lambda2") is >> elmo_lambda_high; - else if (option == "lambda_limit") is >> elmo_lambda_limit; - - else if (option == "reduction_gameply") is >> params.reduction_gameply; - - else if (option == "eval_limit") is >> params.eval_limit; - else if (option == "save_only_once") params.save_only_once = true; - else if (option == "no_shuffle") params.shuffle = false; - - else if (option == "nn_batch_size" - || option == "batch_size") - is >> nn_batch_size; - else if (option == "newbob_decay" - || option == "lr_step") - is >> params.newbob_decay; - else if (option == "newbob_num_trials" - || option == "max_consecutive_rejections") - is >> params.newbob_num_trials; - else if (option == "nn_options") is >> nn_options; - else if (option == "auto_lr_drop") is >> params.auto_lr_drop; - - else if (option == "eval_save_interval") is >> params.eval_save_interval; - else if (option == "loss_output_interval") is >> params.loss_output_interval; - else if (option == "validation_set_file_name") is >> params.validation_set_file_name; - - else if (option == "src_score_min_value") is >> src_score_min_value; - else if (option == "src_score_max_value") is >> src_score_max_value; - else if (option == "dest_score_min_value") is >> dest_score_min_value; - else if (option == "dest_score_max_value") is >> dest_score_max_value; - - else if (option == "sfen_read_size") is >> params.sfen_read_size; - else if (option == "thread_buffer_size") is >> params.thread_buffer_size; - - else if (option == "seed") is >> params.seed; - else if (option == "set_recommended_uci_options") - { - UCI::setoption("Use NNUE", "pure"); - UCI::setoption("MultiPV", "1"); - UCI::setoption("Contempt", "0"); - UCI::setoption("Skill Level", "20"); - UCI::setoption("UCI_Chess960", "false"); - UCI::setoption("UCI_AnalyseMode", "false"); - UCI::setoption("UCI_LimitStrength", "false"); - 
UCI::setoption("PruneAtShallowDepth", "false"); - UCI::setoption("EnableTranspositionTable", "false"); - } - else if (option == "verbose") params.verbose = true; - else if (option == "assume_quiet") params.assume_quiet = true; - else if (option == "smart_fen_skipping") params.smart_fen_skipping = true; - else if (option == "smart_fen_skipping_for_validation") params.smart_fen_skipping_for_validation = true; - else - { - out << "INFO: Unknown option: " << option << ". Ignoring.\n"; - } - } - - out << "INFO: Executing learn command\n"; - - // Issue a warning if OpenMP is disabled. -#if !defined(_OPENMP) - out << "WARNING: OpenMP disabled." << endl; -#endif - - params.enforce_constraints(); - - // Right now we only have the individual files. - // We need to apply base_dir here - if (!target_dir.empty()) - { - append_files_from_dir(params.filenames, base_dir, target_dir); - } - rebase_files(params.filenames, base_dir); - - out << "INFO: Input files:\n"; - for (auto s : params.filenames) - out << " - " << s << '\n'; - - out << "INFO: Parameters:\n"; - if (!params.validation_set_file_name.empty()) - { - out << " - validation set : " << params.validation_set_file_name << endl; - } - - out << " - validation count : " << params.validation_count << endl; - out << " - epochs : " << epochs << endl; - out << " - positions : " << epochs * params.mini_batch_size << endl; - out << " - warmup epochs : " << warmup_epochs << endl; - out << " - warmup positions : " << warmup_epochs * params.mini_batch_size << endl; - out << " - eval_limit : " << params.eval_limit << endl; - out << " - save_only_once : " << (params.save_only_once ? "true" : "false") << endl; - out << " - shuffle on read : " << (params.shuffle ? "true" : "false") << endl; - - out << " - Loss Function : " << LOSS_FUNCTION << endl; - out << " - minibatch size : " << params.mini_batch_size << endl; - - out << " - nn_batch_size : " << nn_batch_size << endl; - out << " - nn_options : " << nn_options << endl; - - out << " - learning rate : " << params.learning_rate << endl; - out << " - warmup learning rate : " << params.warmup_learning_rate << endl; - out << " - max_grad : " << params.max_grad << endl; - out << " - use draws in training : " << params.use_draw_games_in_training << endl; - out << " - use draws in validation : " << params.use_draw_games_in_validation << endl; - out << " - skip repeated positions : " << params.skip_duplicated_positions_in_training << endl; - - out << " - winning prob coeff : " << winning_probability_coefficient << endl; - out << " - use_wdl : " << use_wdl << endl; - - out << " - src_score_min_value : " << src_score_min_value << endl; - out << " - src_score_max_value : " << src_score_max_value << endl; - out << " - dest_score_min_value : " << dest_score_min_value << endl; - out << " - dest_score_max_value : " << dest_score_max_value << endl; - - out << " - reduction_gameply : " << params.reduction_gameply << endl; - - out << " - elmo_lambda_low : " << elmo_lambda_low << endl; - out << " - elmo_lambda_high : " << elmo_lambda_high << endl; - out << " - elmo_lambda_limit : " << elmo_lambda_limit << endl; - out << " - eval_save_interval : " << params.eval_save_interval << " sfens" << endl; - out << " - loss_output_interval : " << params.loss_output_interval << " sfens" << endl; - - out << " - sfen_read_size : " << params.sfen_read_size << endl; - out << " - thread_buffer_size : " << params.thread_buffer_size << endl; - - out << " - smart_fen_skipping : " << params.smart_fen_skipping << endl; - out << " - 
smart_fen_skipping_val : " << params.smart_fen_skipping_for_validation << endl; - - out << " - seed : " << params.seed << endl; - out << " - verbose : " << (params.verbose ? "true" : "false") << endl; - - if (params.auto_lr_drop) { - out << " - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl; - } - else if (params.newbob_decay != 1.0) { - out << " - learning rate scheduling : newbob with decay" << endl; - out << " - newbob_decay : " << params.newbob_decay << endl; - out << " - newbob_num_trials : " << params.newbob_num_trials << endl; - } - else { - out << " - learning rate scheduling : fixed learning rate" << endl; - } - - out << endl; - - out << "INFO: Started initialization." << endl; - - Eval::NNUE::initialize_training(params.seed, out); - Eval::NNUE::set_batch_size(nn_batch_size); - Eval::NNUE::set_options(nn_options); - - LearnerThink learn_think(params); - - out << "Finished initialization." << endl; - - out.unlock(); - - // Start learning. - learn_think.learn(epochs, warmup_epochs); - } - -} // namespace Learner diff --git a/src/learn/learn.h b/src/learn/learn.h deleted file mode 100644 index 842ffad0..00000000 --- a/src/learn/learn.h +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef _LEARN_H_ -#define _LEARN_H_ - -// ---------------------- -// Floating point for learning -// ---------------------- - -// If this is set to double, the calculation accuracy will be higher, but the weight array entangled memory will be doubled. -// Currently, if this is float, the weight array is 4.5 times the size of the evaluation function file. (About 4.5GB with KPPT) -// Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float. - -// when using float -using LearnFloatType = float; - -// when using double -//typedef double LearnFloatType; - -// when using float16 -//#include "half_float.h" -//typedef HalfFloat::float16 LearnFloatType; - -// ====================== -// configure -// ====================== - -// ---------------------- -// Learning with the method of elmo (WCSC27) -// ---------------------- - -#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)" - -// ---------------------- -// Definition of struct used in Learner -// ---------------------- - -#include "autograd.h" -#include "packed_sfen.h" - -#include "position.h" - -#include -#include -#include -#include - -namespace Learner -{ - // ---------------------- - // Settings for learning - // ---------------------- - - // mini-batch size. - // Calculate the gradient by combining this number of phases. - // If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect. - // If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately. - // I don't think you need to change this value in most cases. - - constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1; - - // Saving interval of evaluation function at learning. Save each time you learn this number of phases. - // Needless to say, the longer the saving interval, the shorter the learning time. - // Folder name is incremented for each save like 0/, 1/, 2/... - // By default, once every 1 billion phases. - constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL; - - // Reduce the output of rmse during learning to 1 for this number of times. - // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective. 
- constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1; - - // Learning from the generated game record - void learn(std::istringstream& is); - - using CalcLossFunc = ValueWithGrad(Value, Value, int, int); - - struct Loss - { - double value() const - { - return m_loss.value; - } - - double grad() const - { - return m_loss.grad; - } - - uint64_t count() const - { - return m_count; - } - - Loss() = default; - - Loss(const Loss& other) : - m_loss(other.m_loss), - m_count(other.m_count) - { - } - - Loss& operator += (const ValueWithGrad& rhs) - { - std::unique_lock lock(m_mutex); - - m_loss += rhs.abs(); - m_count += 1; - - return *this; - } - - Loss& operator += (const Loss& rhs) - { - std::unique_lock lock(m_mutex); - - m_loss += rhs.m_loss.abs(); - m_count += rhs.m_count; - - return *this; - } - - void reset() - { - std::unique_lock lock(m_mutex); - - m_loss = ValueWithGrad{ 0.0, 0.0 }; - m_count = 0; - } - - template - void print_with_grad(const std::string& prefix, StreamT& s) const - { - s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl; - s << " - " << prefix << "_grad_norm = " << m_loss.grad / (double)m_count << std::endl; - } - - template - void print_only_loss(const std::string& prefix, StreamT& s) const - { - s << " - " << prefix << "_loss = " << m_loss.value / (double)m_count << std::endl; - } - - private: - ValueWithGrad m_loss{ 0.0, 0.0 }; - uint64_t m_count{0}; - std::mutex m_mutex; - }; -} - -#endif // ifndef _LEARN_H_ diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp deleted file mode 100644 index 8d95221c..00000000 --- a/src/nnue/evaluate_nnue_learner.cpp +++ /dev/null @@ -1,341 +0,0 @@ -#include -#include - -#include "evaluate_nnue.h" -#include "evaluate_nnue_learner.h" - -#include "trainer/features/all_factorizers.h" - -#include "trainer/trainer_feature_transformer.h" -#include "trainer/trainer_input_slice.h" -#include "trainer/trainer_affine_transform.h" -#include "trainer/trainer_clipped_relu.h" -#include "trainer/trainer_sum.h" - -#include "position.h" -#include "uci.h" -#include "misc.h" -#include "thread_win32_osx.h" -#include "thread.h" - -// Code for learning NNUE evaluation function -namespace Eval::NNUE { - - namespace { - - // learning data - std::vector examples; - - // Mutex for exclusive control of examples - std::mutex examples_mutex; - - // number of samples in mini-batch - uint64_t batch_size; - - // random number generator - std::mt19937 rng; - - // learner - std::shared_ptr> trainer; - - // Tell the learner options such as hyperparameters - void send_messages(std::vector messages) { - for (auto& message : messages) { - trainer->send_message(&message); - assert(message.num_receivers > 0); - } - } - - } // namespace - - // Initialize learning - void initialize_training( - const std::string& seed, - SynchronizedRegionLogger::Region& out) { - -#if defined (OPENBLAS_VERSION) - openblas_set_num_threads(1); -#elif defined (INTEL_MKL_VERSION) - mkl_set_num_threads(1); -#endif - - out << "INFO (initialize_training): Initializing NN training for " - << get_architecture_string() << std::endl; - - out << std::endl; - - out << "Layers:\n" - << get_layers_info() << std::endl; - - out << std::endl; - - out << "Factorizers:\n" - << Features::Factorizer::get_factorizers_string() << std::endl; - - out << std::endl; - - assert(feature_transformer); - assert(network); - - trainer = Trainer::create(network.get(), feature_transformer.get()); - rng.seed(PRNG(seed).rand()); - - if (Options["SkipLoadingEval"]) { - 
out << "INFO (initialize_training): Performing random net initialization.\n"; - trainer->initialize(rng); - } - } - - // set the number of samples in the mini-batch - void set_batch_size(uint64_t size) { - assert(size > 0); - batch_size = size; - } - - // Set options such as hyperparameters - void set_options(const std::string& options) { - std::vector messages; - for (const auto& option : Algo::split(options, ',')) { - const auto fields = Algo::split(option, '='); - assert(fields.size() == 1 || fields.size() == 2); - - if (fields.size() == 1) { - messages.emplace_back(fields[0]); - } else { - messages.emplace_back(fields[0], fields[1]); - } - } - - send_messages(std::move(messages)); - } - - // Reread the evaluation function parameters for learning from the file - void restore_parameters(const std::string& dir_name) { - const std::string file_name = Path::combine(dir_name, NNUE::savedfileName); - std::ifstream stream(file_name, std::ios::binary); -#ifndef NDEBUG - bool result = -#endif - ReadParameters(stream); -#ifndef NDEBUG - assert(result); -#endif - - send_messages({{"reset"}}); - } - - void finalize_net() { - send_messages({{"clear_unobserved_feature_weights"}}); - } - - // Add 1 sample of learning data - void add_example( - Position& pos, - Color rootColor, - Value discrete_nn_eval, - const Learner::PackedSfenValue& psv, - double weight) { - - Example example; - if (rootColor == pos.side_to_move()) { - example.sign = 1; - } else { - example.sign = -1; - } - - example.discrete_nn_eval = discrete_nn_eval; - example.psv = psv; - example.weight = weight; - - Features::IndexList active_indices[2]; - for (const auto trigger : kRefreshTriggers) { - RawFeatures::append_active_indices(pos, trigger, active_indices); - } - - if (pos.side_to_move() != WHITE) { - active_indices[0].swap(active_indices[1]); - } - - static thread_local std::vector s_training_features; - auto& training_features = s_training_features; - - for (const auto color : Colors) { - training_features.clear(); - - for (const auto base_index : active_indices[color]) { - static_assert(Features::Factorizer::get_dimensions() < - (1 << TrainingFeature::kIndexBits), ""); - Features::Factorizer::append_training_features( - base_index, &training_features); - } - - std::sort(training_features.begin(), training_features.end()); - - auto& unique_features = example.training_features[color]; - unique_features.reserve(training_features.size()); - for (const auto& feature : training_features) { - if (!unique_features.empty() && - feature.get_index() == unique_features.back().get_index()) { - - unique_features.back() += feature; - } else { - unique_features.push_back(feature); - } - } - } - - std::lock_guard lock(examples_mutex); - examples.push_back(std::move(example)); - } - - // update the evaluation function parameters - Learner::Loss update_parameters( - ThreadPool& thread_pool, - uint64_t epoch, - bool verbose, - double learning_rate, - double max_grad, - Learner::CalcLossFunc calc_loss) - { - using namespace Learner::Autograd::UnivariateStatic; - - assert(batch_size > 0); - - learning_rate /= batch_size; - - std::lock_guard lock(examples_mutex); - - double abs_eval_diff_sum = 0.0; - double abs_discrete_eval_sum = 0.0; - double gradient_norm = 0.0; - - bool collect_stats = verbose; - - Learner::Loss loss_sum{}; - - std::vector abs_eval_diff_sum_local(thread_pool.size(), 0.0); - std::vector abs_discrete_eval_sum_local(thread_pool.size(), 0.0); - std::vector gradient_norm_local(thread_pool.size(), 0.0); - std::vector 
loss_sum_local(thread_pool.size()); - - auto prev_batch_begin = examples.end(); - while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) { - auto batch_begin = prev_batch_begin - batch_size; - auto batch_end = prev_batch_begin; - auto size = batch_end - batch_begin; - const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end); - std::vector gradients(size); - - thread_pool.for_each_index_chunk_with_workers( - std::size_t(0), size, - [&](Thread& th, std::size_t offset, std::size_t count) { - const auto thread_id = th.thread_idx(); - - trainer->propagate(th, offset, count); - - for (std::size_t b = offset; b < offset + count; ++b) { - const auto& e = *(batch_begin + b); - const auto shallow = static_cast(round( - e.sign * network_output[b] * kPonanzaConstant)); - const auto discrete = e.sign * e.discrete_nn_eval; - const auto& psv = e.psv; - auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly); - loss.grad = std::clamp( - loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad); - gradients[b] = static_cast(loss.grad); - loss_sum_local[thread_id] += loss; - - // The discrete eval will only be valid before first backpropagation, - // that is only for the first batch. - // Similarily we want only gradients from one batch. - if (collect_stats) - { - abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow); - abs_discrete_eval_sum_local[thread_id] += std::abs(discrete); - gradient_norm_local[thread_id] += std::abs(loss.grad); - } - } - - trainer->backpropagate(th, gradients.data(), offset, count); - } - ); - - // We can asyncronously erase the examples that we used in the previous - // step. This can be done safely because we're no longer using these - // examples and erase won't invalidate iterators. 
- examples.erase(prev_batch_begin, examples.end()); - prev_batch_begin = batch_begin; - - thread_pool.wait_for_workers_finished(); - - trainer->step_end(thread_pool, learning_rate); - - collect_stats = false; - } - examples.erase(prev_batch_begin, examples.end()); - - if (verbose) - { - abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0); - abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0); - gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0); - - const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size; - const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size; - - auto out = sync_region_cout.new_region(); - - out << "INFO (update_parameters):" - << " epoch = " << epoch - << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff - << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval - << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval - << " , batch_size = " << batch_size - << " , grad_norm = " << gradient_norm - << std::endl; - } else { - // Display some progress but don't synchronize as - // we can't really decide when to release the output lock here - std::cout << '.'; - } - - send_messages({{"quantize_parameters"}}); - - for(auto& loss : loss_sum_local) - { - loss_sum += loss; - } - - return loss_sum; - } - - // Check if there are any problems with learning - void check_health() { - send_messages({{"check_health"}}); - } - - // save merit function parameters to a file - void save_eval(std::string dir_name) { - auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name); - - auto out = sync_region_cout.new_region(); - - out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl; - - // mkdir() will fail if this folder already exists, but - // Apart from that. If not, I just want you to make it. - // Also, assume that the folders up to EvalSaveDir have been dug. 
- sys::create_directories(eval_dir); - - const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName); - std::ofstream stream(file_name, std::ios::binary); -#ifndef NDEBUG - bool result = -#endif - WriteParameters(stream); -#ifndef NDEBUG - assert(result); -#endif - out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl; - } -} // namespace Eval::NNUE diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h deleted file mode 100644 index 3d9f5b31..00000000 --- a/src/nnue/evaluate_nnue_learner.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef _EVALUATE_NNUE_LEARNER_H_ -#define _EVALUATE_NNUE_LEARNER_H_ - -#include "learn/learn.h" - -#include "misc.h" - -struct ThreadPool; - -// Interface used for learning NNUE evaluation function -namespace Eval::NNUE { - - // Initialize learning - void initialize_training( - const std::string& seed, - SynchronizedRegionLogger::Region& out); - - // set the number of samples in the mini-batch - void set_batch_size(uint64_t size); - - // Set options such as hyperparameters - void set_options(const std::string& options); - - // Reread the evaluation function parameters for learning from the file - void restore_parameters(const std::string& dir_name); - - // Add 1 sample of learning data - void add_example( - Position& pos, - Color rootColor, - Value discrete_nn_eval, - const Learner::PackedSfenValue& psv, - double weight); - - // update the evaluation function parameters - Learner::Loss update_parameters( - ThreadPool& thread_pool, - uint64_t epoch, - bool verbose, - double learning_rate, - double max_grad, - Learner::CalcLossFunc calc_loss); - - // Check if there are any problems with learning - void check_health(); - - void finalize_net(); - - void save_eval(std::string suffix); -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp deleted file mode 100644 index d892222b..00000000 --- a/src/nnue/nnue_test_command.cpp +++ /dev/null @@ -1,215 +0,0 @@ -#include "evaluate_nnue.h" -#include "nnue_test_command.h" - -#include "thread.h" -#include "uci.h" - -#include -#include - -#define ASSERT(X) { \ - if (!(X)) { \ - std::cout \ - << "\nError : ASSERT(" << #X << "), " \ - << __FILE__ << "(" << __LINE__ << "): " \ - << __func__ << std::endl; \ - std::this_thread::sleep_for(std::chrono::microseconds(3000)); \ - *(int*)1 =0; \ - } \ -} - -// USI extended command for NNUE evaluation function -namespace Eval::NNUE { - - namespace { - - // Testing RawFeatures mainly for difference calculation - void test_features(Position& pos) { - const std::uint64_t num_games = 1000; - StateInfo si; - pos.set(StartFEN, false, &si, Threads.main()); - const int MAX_PLY = 256; // test up to 256 hands - - StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps - int ply; // Trouble from the initial phase - - PRNG prng(20171128); - - std::uint64_t num_moves = 0; - std::vector num_updates(kRefreshTriggers.size() + 1); - std::vector num_resets(kRefreshTriggers.size()); - constexpr IndexType kUnknown = -1; - std::vector trigger_map(RawFeatures::kDimensions, kUnknown); - - auto make_index_sets = [&](const Position& position) { - std::vector>> index_sets( - kRefreshTriggers.size(), std::vector>(2)); - - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - Features::IndexList active_indices[2]; - RawFeatures::append_active_indices(position, kRefreshTriggers[i], - active_indices); - - for (const auto perspective : Colors) { - for (const auto index : 
active_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT(index_sets[i][perspective].count(index) == 0); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - index_sets[i][perspective].insert(index); - trigger_map[index] = i; - } - } - } - - return index_sets; - }; - - auto update_index_sets = [&](const Position& position, auto* index_sets) { - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - Features::IndexList removed_indices[2], added_indices[2]; - bool reset[2] = { false, false }; - RawFeatures::append_changed_indices(position, kRefreshTriggers[i], - removed_indices, added_indices, reset); - for (const auto perspective : Colors) { - if (reset[perspective]) { - (*index_sets)[i][perspective].clear(); - ++num_resets[i]; - } else { - for (const auto index : removed_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT((*index_sets)[i][perspective].count(index) == 1); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - (*index_sets)[i][perspective].erase(index); - ++num_updates.back(); - ++num_updates[i]; - trigger_map[index] = i; - } - } - - for (const auto index : added_indices[perspective]) { - ASSERT(index < RawFeatures::kDimensions); - ASSERT((*index_sets)[i][perspective].count(index) == 0); - ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); - (*index_sets)[i][perspective].insert(index); - ++num_updates.back(); - ++num_updates[i]; - trigger_map[index] = i; - } - } - } - }; - - std::cout << "feature set: " << RawFeatures::get_name() - << "[" << RawFeatures::kDimensions << "]" << std::endl; - std::cout << "start testing with random games"; - - for (std::uint64_t i = 0; i < num_games; ++i) { - auto index_sets = make_index_sets(pos); - for (ply = 0; ply < MAX_PLY; ++ply) { - MoveList mg(pos); // Generate all legal hands - - // There was no legal move == Clog - if (mg.size() == 0) - break; - - // Randomly choose from the generated moves and advance the phase with the moves. - Move m = mg.begin()[prng.rand(mg.size())]; - pos.do_move(m, state[ply]); - - ++num_moves; - update_index_sets(pos, &index_sets); - ASSERT(index_sets == make_index_sets(pos)); - } - - pos.set(StartFEN, false, &si, Threads.main()); - - // Output'.' every 100 times (so you can see that it's progressing) - if ((i % 100) == 0) - std::cout << "." << std::flush; - } - - std::cout << "passed." 
<< std::endl; - std::cout << num_games << " games, " << num_moves << " moves, " - << num_updates.back() << " updates, " - << (1.0 * num_updates.back() / num_moves) - << " updates per move" << std::endl; - std::size_t num_observed_indices = 0; - - for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { - const auto count = std::count(trigger_map.begin(), trigger_map.end(), i); - num_observed_indices += count; - std::cout << "TriggerEvent(" << static_cast(kRefreshTriggers[i]) - << "): " << count << " features (" - << (100.0 * count / RawFeatures::kDimensions) << "%), " - << num_updates[i] << " updates (" - << (1.0 * num_updates[i] / num_moves) << " per move), " - << num_resets[i] << " resets (" - << (100.0 * num_resets[i] / num_moves) << "%)" - << std::endl; - } - std::cout << "observed " << num_observed_indices << " (" - << (100.0 * num_observed_indices / RawFeatures::kDimensions) - << "% of " << RawFeatures::kDimensions - << ") features" << std::endl; - } - - // Output a string that represents the structure of the evaluation function - void print_info(std::istream& stream) { - std::cout << "network architecture: " << get_architecture_string() << std::endl; - - while (true) { - std::string file_name; - stream >> file_name; - if (file_name.empty()) - break; - - std::uint32_t hash_value; - std::string architecture; - const bool success = [&]() { - std::ifstream file_stream(file_name, std::ios::binary); - - if (!file_stream) - return false; - if (!read_header(file_stream, &hash_value, &architecture)) - return false; - - return true; - }(); - - std::cout << file_name << ": "; - if (success) { - if (hash_value == kHashValue) { - std::cout << "matches with this binary"; - if (architecture != get_architecture_string()) { - std::cout << ", but architecture string differs: " << architecture; - } - - std::cout << std::endl; - } else { - std::cout << architecture << std::endl; - } - } else { - std::cout << "failed to read header" << std::endl; - } - } - } - - } // namespace - - // USI extended command for NNUE evaluation function - void test_command(Position& pos, std::istream& stream) { - std::string sub_command; - stream >> sub_command; - - if (sub_command == "test_features") { - test_features(pos); - } else if (sub_command == "info") { - print_info(stream); - } else { - std::cout << "usage:" << std::endl; - std::cout << " test nnue test_features" << std::endl; - std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl; - } - } - -} // namespace Eval::NNUE diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h deleted file mode 100644 index fcfe16f6..00000000 --- a/src/nnue/nnue_test_command.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _NNUE_TEST_COMMAND_H_ -#define _NNUE_TEST_COMMAND_H_ - -// USI extended command interface for NNUE evaluation function -namespace Eval::NNUE { - - // USI extended command for NNUE evaluation function - void test_command(Position& pos, std::istream& stream); - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h deleted file mode 100644 index 75d62ec8..00000000 --- a/src/nnue/trainer/features/all_factorizers.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_ -#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_ - -#include "factorizer.h" -#include "factorizer_feature_set.h" - -#include "factorizer_half_kp.h" -#include "factorizer_half_ka.h" - -#endif diff --git a/src/nnue/trainer/features/factorizer.h 
b/src/nnue/trainer/features/factorizer.h deleted file mode 100644 index b64b0c74..00000000 --- a/src/nnue/trainer/features/factorizer.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_ - -#include "nnue/nnue_common.h" - -#include "nnue/trainer/trainer.h" - -// NNUE evaluation function feature conversion class template -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // By default, the learning feature is the same as the original input feature, and specialized as necessary - template - class Factorizer { - public: - static constexpr std::string get_name() { - return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer"); - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return FeatureType::kDimensions; - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - assert(base_index emplace_back(base_index); - } - }; - - // Learning feature information - struct FeatureProperties { - bool active; - IndexType dimensions; - }; - - // Add the original input features to the learning features - template - IndexType append_base_feature( - FeatureProperties properties, IndexType base_index, - std::vector* training_features) { - - assert(properties.dimensions == FeatureType::kDimensions); - assert(base_index < FeatureType::kDimensions); - training_features->emplace_back(base_index); - return properties.dimensions; - } - - // If the learning rate scale is not 0, inherit other types of learning features - template - IndexType inherit_features_if_required( - IndexType index_offset, FeatureProperties properties, IndexType base_index, - std::vector* training_features) { - - if (!properties.active) { - return 0; - } - - assert(properties.dimensions == Factorizer::get_dimensions()); - assert(base_index < FeatureType::kDimensions); - - const auto start = training_features->size(); - Factorizer::append_training_features( - base_index, training_features); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - assert(feature.get_index() < Factorizer::get_dimensions()); - feature.shift_index(index_offset); - } - - return properties.dimensions; - } - - // Return the index difference as needed, without adding learning features - // Call instead of InheritFeaturesIfRequired() if there are no corresponding features - IndexType skip_features(FeatureProperties properties) { - if (!properties.active) - return 0; - - return properties.dimensions; - } - - // Get the dimensionality of the learning feature - template - constexpr IndexType get_active_dimensions( - const FeatureProperties (&properties)[N]) { - - static_assert(N > 0, ""); - - IndexType dimensions = properties[0].dimensions; - - for (std::size_t i = 1; i < N; ++i) { - if (properties[i].active) { - dimensions += properties[i].dimensions; - } - } - - return dimensions; - } - - // get the number of elements in the array - template - constexpr std::size_t get_array_length(const T (&/*array*/)[N]) { - return N; - } - -} // namespace Eval::NNUE::Features - -#endif diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h deleted file mode 
100644 index 60f42166..00000000 --- a/src/nnue/trainer/features/factorizer_feature_set.h +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_ - -#include "factorizer.h" - -#include "nnue/features/feature_set.h" - -// Specialization for feature set of feature conversion class template of NNUE evaluation function -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for FeatureSet - template - class Factorizer> { - private: - using Head = Factorizer>; - using Tail = Factorizer>; - - public: - // number of dimensions of original input features - static constexpr IndexType kBaseDimensions = - FeatureSet::kDimensions; - - static constexpr std::string get_factorizers_string() { - std::string str = " - "; - str += Head::get_name(); - str += '\n'; - str += Tail::get_factorizers_string(); - return str; - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return Head::get_dimensions() + Tail::get_dimensions(); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features, - IndexType base_dimensions = kBaseDimensions) { - - assert(base_index < kBaseDimensions); - - constexpr auto boundary = FeatureSet::kDimensions; - - if (base_index < boundary) { - Tail::append_training_features( - base_index, training_features, base_dimensions); - } - else { - const auto start = training_features->size(); - - Head::append_training_features( - base_index - boundary, training_features, base_dimensions); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - const auto index = feature.get_index(); - - assert(index < Head::get_dimensions() || - (index >= base_dimensions && - index < base_dimensions + - Head::get_dimensions() - Head::kBaseDimensions)); - - if (index < Head::kBaseDimensions) { - feature.shift_index(Tail::kBaseDimensions); - } - else { - feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions); - } - } - } - } - }; - - // Class template that converts input features into learning features - // Specialization when FeatureSet has one template argument - template - class Factorizer> { - public: - // number of dimensions of original input features - static constexpr IndexType kBaseDimensions = FeatureType::kDimensions; - - static constexpr std::string get_name() { - return Factorizer::get_name(); - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return Factorizer::get_dimensions(); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features, - IndexType base_dimensions = kBaseDimensions) { - - assert(base_index < kBaseDimensions); - - const auto start = training_features->size(); - - Factorizer::append_training_features( - base_index, training_features); - - for (auto i = start; i < training_features->size(); ++i) { - auto& feature = (*training_features)[i]; - assert(feature.get_index() < Factorizer::get_dimensions()); - if (feature.get_index() >= kBaseDimensions) { - feature.shift_index(base_dimensions - kBaseDimensions); - } - } - } - }; - -} // namespace Eval::NNUE::Features 
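The factorizer templates above reduce to one bookkeeping rule: the base feature index is always emitted unchanged, and each active factored family appends its own indices shifted by the combined width of every block that precedes it, so the concatenated training-feature space stays densely packed and get_active_dimensions() is just the sum of the active blocks. A minimal standalone sketch of that rule, using plain integer indices and a hypothetical Family record instead of the real TrainingFeature/FeatureProperties types, might look like this:

    #include <cstdint>
    #include <functional>
    #include <optional>
    #include <vector>

    // A factored feature family: a contiguous block of `dimensions` indices and a
    // projection from the base feature into that block (nullopt when the base
    // feature has no counterpart, mirroring skip_features() above).
    struct Family {
        bool active;
        std::uint32_t dimensions;
        std::function<std::optional<std::uint32_t>(std::uint32_t)> project;
    };

    std::vector<std::uint32_t> factorize(std::uint32_t base_index,
                                         std::uint32_t base_dimensions,
                                         const std::vector<Family>& families) {
        std::vector<std::uint32_t> features;
        features.push_back(base_index);          // append_base_feature(): raw index as-is
        std::uint32_t offset = base_dimensions;  // factored blocks start after the base block
        for (const Family& f : families) {
            if (!f.active)
                continue;                        // inactive: contributes no indices and no offset
            if (auto idx = f.project(base_index))
                features.push_back(offset + *idx);  // shift_index() into this family's block
            offset += f.dimensions;              // the block is reserved even if nothing matched
        }
        return features;
    }

For HalfKP, say, the families would be HalfK, P and HalfRelativeKP, with the last projection returning nullopt whenever the piece slot is empty, which is exactly the case the deleted code routes through skip_features().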
- -#endif diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h deleted file mode 100644 index 36d36a2d..00000000 --- a/src/nnue/trainer/features/factorizer_half_ka.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ - -#include "factorizer.h" - -#include "nnue/features/half_ka.h" -#include "nnue/features/a.h" -#include "nnue/features/half_relative_ka.h" - -// Specialization of NNUE evaluation function feature conversion class template for HalfKA -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for HalfKA - template - class Factorizer> { - private: - using FeatureType = HalfKA; - - // The maximum value of the number of indexes whose value is 1 at the same time among the feature values - static constexpr IndexType kMaxActiveDimensions = - FeatureType::kMaxActiveDimensions; - - // Type of learning feature - enum TrainingFeatureType { - kFeaturesHalfKA, - kFeaturesA, - kFeaturesHalfRelativeKA, - kNumTrainingFeatureTypes, - }; - - // Learning feature information - static constexpr FeatureProperties kProperties[] = { - // kFeaturesHalfA - {true, FeatureType::kDimensions}, - // kFeaturesA - {true, Factorizer::get_dimensions()}, - // kFeaturesHalfRelativeKA - {true, Factorizer>::get_dimensions()}, - }; - - static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, ""); - - public: - static constexpr std::string get_name() { - return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA"; - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return get_active_dimensions(kProperties); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - // kFeaturesHalfA - IndexType index_offset = append_base_feature( - kProperties[kFeaturesHalfKA], base_index, training_features); - - const auto sq_k = static_cast(base_index / PS_END2); - const auto a = static_cast(base_index % PS_END2); - - // kFeaturesA - index_offset += inherit_features_if_required( - index_offset, kProperties[kFeaturesA], a, training_features); - - // kFeaturesHalfRelativeKA - if (a >= PS_W_PAWN) { - index_offset += inherit_features_if_required>( - index_offset, kProperties[kFeaturesHalfRelativeKA], - HalfRelativeKA::make_index(sq_k, a), - training_features); - } - else { - index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]); - } - - assert(index_offset == get_dimensions()); - } - }; - - template - constexpr FeatureProperties Factorizer>::kProperties[]; - -} // namespace Eval::NNUE::Features - -#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_ diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h deleted file mode 100644 index c554f0fc..00000000 --- a/src/nnue/trainer/features/factorizer_half_kp.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_ -#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_ - -#include "factorizer.h" - -#include "nnue/features/half_kp.h" -#include "nnue/features/p.h" -#include "nnue/features/half_relative_kp.h" - -// Specialization of NNUE evaluation function feature conversion class 
template for HalfKP -namespace Eval::NNUE::Features { - - // Class template that converts input features into learning features - // Specialization for HalfKP - template - class Factorizer> { - private: - using FeatureType = HalfKP; - - // The maximum value of the number of indexes whose value is 1 at the same time among the feature values - static constexpr IndexType kMaxActiveDimensions = - FeatureType::kMaxActiveDimensions; - - // Type of learning feature - enum TrainingFeatureType { - kFeaturesHalfKP, - kFeaturesHalfK, - kFeaturesP, - kFeaturesHalfRelativeKP, - kNumTrainingFeatureTypes, - }; - - // Learning feature information - static constexpr FeatureProperties kProperties[] = { - // kFeaturesHalfKP - {true, FeatureType::kDimensions}, - // kFeaturesHalfK - {true, SQUARE_NB}, - // kFeaturesP - {true, Factorizer
<P>
::get_dimensions()}, - // kFeaturesHalfRelativeKP - {true, Factorizer>::get_dimensions()}, - }; - - static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, ""); - - public: - static constexpr std::string get_name() { - return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP"; - } - - static constexpr std::string get_factorizers_string() { - return " - " + get_name(); - } - - // Get the dimensionality of the learning feature - static constexpr IndexType get_dimensions() { - return get_active_dimensions(kProperties); - } - - // Get index of learning feature and scale of learning rate - static void append_training_features( - IndexType base_index, std::vector* training_features) { - - // kFeaturesHalfKP - IndexType index_offset = append_base_feature( - kProperties[kFeaturesHalfKP], base_index, training_features); - - const auto sq_k = static_cast(base_index / PS_END); - const auto p = static_cast(base_index % PS_END); - - // kFeaturesHalfK - { - const auto& properties = kProperties[kFeaturesHalfK]; - if (properties.active) { - training_features->emplace_back(index_offset + sq_k); - index_offset += properties.dimensions; - } - } - - // kFeaturesP - index_offset += inherit_features_if_required
<P>
( - index_offset, kProperties[kFeaturesP], p, training_features); - // kFeaturesHalfRelativeKP - if (p >= PS_W_PAWN) { - index_offset += inherit_features_if_required>( - index_offset, kProperties[kFeaturesHalfRelativeKP], - HalfRelativeKP::make_index(sq_k, p), - training_features); - } - else { - index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]); - } - - assert(index_offset == get_dimensions()); - } - }; - - template - constexpr FeatureProperties Factorizer>::kProperties[]; - -} // namespace Eval::NNUE::Features - -#endif diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h deleted file mode 100644 index 973bc898..00000000 --- a/src/nnue/trainer/trainer.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _NNUE_TRAINER_H_ -#define _NNUE_TRAINER_H_ - -#include "nnue/nnue_common.h" -#include "nnue/features/index_list.h" - -#include - -#if defined(USE_BLAS) -static_assert(std::is_same::value, ""); -#include -#endif - -// Common header of class template for learning NNUE evaluation function -namespace Eval::NNUE { - - // Ponanza constant used in the relation between evaluation value and winning percentage - constexpr double kPonanzaConstant = 600.0; - - // Class that represents one index of learning feature - class TrainingFeature { - using StorageType = std::uint32_t; - static_assert(std::is_unsigned::value, ""); - - public: - static constexpr std::uint32_t kIndexBits = 24; - - static_assert(kIndexBits < std::numeric_limits::digits, ""); - - static constexpr std::uint32_t kCountBits = - std::numeric_limits::digits - kIndexBits; - - explicit TrainingFeature(IndexType index) : - index_and_count_((index << kCountBits) | 1) { - - assert(index < (1 << kIndexBits)); - } - - TrainingFeature& operator+=(const TrainingFeature& other) { - assert(other.get_index() == get_index()); - assert(other.get_count() + get_count() < (1 << kCountBits)); - index_and_count_ += other.get_count(); - return *this; - } - - IndexType get_index() const { - return static_cast(index_and_count_ >> kCountBits); - } - - void shift_index(IndexType offset) { - assert(get_index() + offset < (1 << kIndexBits)); - index_and_count_ += offset << kCountBits; - } - - IndexType get_count() const { - return static_cast(index_and_count_ & ((1 << kCountBits) - 1)); - } - - bool operator<(const TrainingFeature& other) const { - return index_and_count_ < other.index_and_count_; - } - - private: - StorageType index_and_count_; - }; - - // Structure that represents one sample of training data - struct Example { - std::vector training_features[2]; - Learner::PackedSfenValue psv; - Value discrete_nn_eval; - int sign; - double weight; - }; - - // Message used for setting hyperparameters - struct Message { - Message(const std::string& message_name, const std::string& message_value = "") : - name(message_name), value(message_value), num_peekers(0), num_receivers(0) - { - } - - const std::string name; - const std::string value; - std::uint32_t num_peekers; - std::uint32_t num_receivers; - }; - - // determine whether to accept the message - bool receive_message(const std::string& name, Message* message) { - const auto subscript = "[" + std::to_string(message->num_peekers) + "]"; - - if (message->name.substr(0, name.size() + 1) == name + "[") { - ++message->num_peekers; - } - - if (message->name == name || message->name == name + subscript) { - ++message->num_receivers; - return true; - } - - return false; - } - - // round a floating point number to an integer - template - IntType round(double value) { - return 
static_cast(std::floor(value + 0.5)); - } - - // make_shared with alignment - template - std::shared_ptr make_aligned_shared_ptr(ArgumentTypes&&... arguments) { - const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T))) - T(std::forward(arguments)...); - - return std::shared_ptr(ptr, AlignedDeleter()); - } - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h deleted file mode 100644 index 53e8f904..00000000 --- a/src/nnue/trainer/trainer_affine_transform.h +++ /dev/null @@ -1,476 +0,0 @@ -#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_ -#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/affine_transform.h" - -#include "thread.h" - -#include - -// Specialization of NNUE evaluation function learning class template for AffineTransform -namespace Eval::NNUE { - - // Learning: Affine transformation layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::AffineTransform; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - - if (receive_message("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - - if (receive_message("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - - if (receive_message("reset", message)) { - dequantize_parameters(); - } - - if (receive_message("quantize_parameters", message)) { - quantize_parameters(); - } - - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - - if (kIsOutputLayer) { - // Initialize output layer with 0 - std::fill(std::begin(biases_), std::end(biases_), - static_cast(0.0)); - std::fill(std::begin(weights_), std::end(weights_), - static_cast(0.0)); - } - else { - // Assuming that the input distribution is unit-mean 0.5, equal variance, - // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input - const double kSigma = 1.0 / std::sqrt(kInputDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - double sum = 0.0; - for (IndexType j = 0; j < kInputDimensions; ++j) { - const auto weight = static_cast(distribution(rng)); - weights_[kInputDimensions * i + j] = weight; - sum += weight; - } - - biases_[i] = static_cast(0.5 - 0.5 * sum); - } - } - - quantize_parameters(); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - combined_batch_size_ = size; - combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end); - - auto& main_thread_state = 
thread_states_[0]; - -#if defined(USE_BLAS) - - // update - cblas_sscal( - kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1 - ); - -#else - - Blas::sscal( - kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1 - ); - -#endif - - for (IndexType i = 1; i < thread_states_.size(); ++i) - thread_states_[i].reset_biases(); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, const uint64_t offset, const uint64_t count) { - - previous_layer_trainer_->propagate(th, offset, count); - -#if defined(USE_BLAS) - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_scopy( - kOutputDimensions, biases_, 1, &output_[batch_offset], 1 - ); - } - - cblas_sgemm( - CblasColMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, count, kInputDimensions, - 1.0, - weights_, kInputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - 1.0, - &output_[offset * kOutputDimensions], kOutputDimensions - ); -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - Blas::scopy( - kOutputDimensions, biases_, 1, &output_[batch_offset], 1 - ); - } - - Blas::sgemm( - Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans, - kOutputDimensions, count, kInputDimensions, - 1.0, - weights_, kInputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - 1.0, - &output_[offset * kOutputDimensions], kOutputDimensions - ); - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - const auto momentum = th.thread_idx() == 0 ? 
momentum_ : 0.0f; -#if defined(USE_BLAS) - - cblas_sgemm( - CblasColMajor, CblasNoTrans, CblasNoTrans, - kInputDimensions, count, kOutputDimensions, - 1.0, - weights_, kInputDimensions, - gradients + offset * kOutputDimensions, kOutputDimensions, - 0.0, - &gradients_[offset * kInputDimensions], kInputDimensions - ); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - cblas_saxpy( - kOutputDimensions, 1.0, - &gradients[batch_offset], 1, thread_state.biases_diff_, 1 - ); - } - - cblas_sgemm( - CblasRowMajor, CblasTrans, CblasNoTrans, - kOutputDimensions, kInputDimensions, count, - 1.0, - gradients + offset * kOutputDimensions, kOutputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - momentum, - thread_state.weights_diff_, kInputDimensions - ); - -#else - - // backpropagate - Blas::sgemm( - Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans, - kInputDimensions, count, kOutputDimensions, - 1.0, - weights_, kInputDimensions, - gradients + offset * kOutputDimensions, kOutputDimensions, - 0.0, - &gradients_[offset * kInputDimensions], kInputDimensions - ); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - Blas::saxpy(kOutputDimensions, 1.0, - &gradients[batch_offset], 1, thread_state.biases_diff_, 1); - } - - Blas::sgemm( - Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans, - kOutputDimensions, kInputDimensions, count, - 1.0, - gradients + offset * kOutputDimensions, kOutputDimensions, - combined_batch_input_ + offset * kInputDimensions, kInputDimensions, - momentum, - thread_state.weights_diff_, kInputDimensions - ); - -#endif - - previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void reduce_thread_state() - { - for (IndexType i = 1; i < thread_states_.size(); ++i) - { - thread_states_[0] += thread_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) - { - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; - - reduce_thread_state(); - - auto& main_thread_state = thread_states_[0]; - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const double d = local_learning_rate * main_thread_state.biases_diff_[i]; - biases_[i] -= d; - abs_biases_diff_sum_ += std::abs(d); - } - num_biases_diffs_ += kOutputDimensions; - - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - const double d = local_learning_rate * main_thread_state.weights_diff_[i]; - weights_[i] -= d; - abs_weights_diff_sum_ += std::abs(d); - } - num_weights_diffs_ += kOutputDimensions * kInputDimensions; - - previous_layer_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - combined_batch_size_(0), - combined_batch_input_(nullptr), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer), - biases_(), - weights_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - - dequantize_parameters(); - } - - void reset_stats() { - abs_biases_diff_sum_ = 0.0; - abs_weights_diff_sum_ = 0.0; - num_biases_diffs_ = 0; - num_weights_diffs_ = 0; - } - - void check_health() { - - double abs_bias_sum = 0.0; - double abs_weight_sum = 0.0; - - for(auto b : biases_) - abs_bias_sum += std::abs(b); - - for(auto w : weights_) - abs_weight_sum 
+= std::abs(w); - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl; - out << " - avg_abs_bias_diff = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl; - out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl; - out << " - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl; - - out.unlock(); - - reset_stats(); - } - - // Weight saturation and parameterization - void quantize_parameters() { - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) { - weights_[i] = std::max(-kMaxWeightMagnitude, - std::min(+kMaxWeightMagnitude, weights_[i])); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - target_layer_->biases_[i] = - round(biases_[i] * kBiasScale); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - target_layer_->weights_[padded_offset + j] = - round( - weights_[offset + j] * kWeightScale); - } - } - } - - // read parameterized integer - void dequantize_parameters() { - for (IndexType i = 0; i < kOutputDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const auto offset = kInputDimensions * i; - const auto padded_offset = LayerType::kPaddedInputDimensions * i; - for (IndexType j = 0; j < kInputDimensions; ++j) { - weights_[offset + j] = static_cast( - target_layer_->weights_[padded_offset + j] / kWeightScale); - } - } - - for (auto& state : thread_states_) - { - state.reset_weights(); - state.reset_biases(); - } - - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kInputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // If the output dimensionality is 1, the output layer - static constexpr bool kIsOutputLayer = kOutputDimensions == 1; - - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - - static constexpr LearnFloatType kBiasScale = kIsOutputLayer ? 
- (kPonanzaConstant * FV_SCALE) : - ((1 << kWeightScaleBits) * kActivationScale); - - static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale; - - // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers - static constexpr LearnFloatType kMaxWeightMagnitude = - std::numeric_limits::max() / kWeightScale; - - // number of samples in mini-batch - IndexType combined_batch_size_; - - double abs_biases_diff_sum_; - double abs_weights_diff_sum_; - uint64_t num_biases_diffs_; - uint64_t num_weights_diffs_; - - // Input mini batch - const LearnFloatType* combined_batch_input_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // parameter - struct alignas(kCacheLineSize) ThreadState - { - // Buffer used for updating parameters - alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions]; - alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions]; - - ThreadState() { reset_weights(); reset_biases(); } - - ThreadState& operator+=(const ThreadState& other) - { - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - biases_diff_[i] += other.biases_diff_[i]; - } - - for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) - { - weights_diff_[i] += other.weights_diff_[i]; - } - - return *this; - } - - void reset_weights() - { - std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f); - } - - void reset_biases() - { - std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f); - } - }; - - alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions]; - alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions]; - - std::vector> thread_states_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h deleted file mode 100644 index 48dec8be..00000000 --- a/src/nnue/trainer/trainer_clipped_relu.h +++ /dev/null @@ -1,354 +0,0 @@ -#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_ -#define _NNUE_TRAINER_CLIPPED_RELU_H_ - -#include "trainer.h" - -#include "learn/learn.h" - -#include "nnue/layers/clipped_relu.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for ClippedReLU -namespace Eval::NNUE { - - // Learning: Affine transformation layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::ClippedReLU; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { 
- output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end); - - batch_size_ = size; - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, const uint64_t offset, const uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - - previous_layer_trainer_->propagate(th, offset, count); - -#if defined (USE_SSE2) - - { - static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]); - __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]); - __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]); - __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]); - - out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0)); - out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1)); - out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2)); - out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3)); - - _mm_storeu_ps(&output_[i + 0 + batch_offset], out0); - _mm_storeu_ps(&output_[i + 4 + batch_offset], out1); - _mm_storeu_ps(&output_[i + 8 + batch_offset], out2); - _mm_storeu_ps(&output_[i + 12 + batch_offset], out3); - - __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]); - __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]); - __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]); - __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]); - - __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]); - __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]); - __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]); - __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]); - - minact0 = _mm_min_ps(out0, minact0); - minact1 = _mm_min_ps(out1, minact1); - minact2 = _mm_min_ps(out2, minact2); - minact3 = _mm_min_ps(out3, minact3); - - maxact0 = _mm_max_ps(out0, maxact0); - maxact1 = _mm_max_ps(out1, maxact1); - maxact2 = _mm_max_ps(out2, maxact2); - maxact3 = _mm_max_ps(out3, maxact3); - - _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0); - _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1); - _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2); - _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3); - - _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0); - _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1); - _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2); - _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - output_[index] = std::max(+kZero, std::min(+kOne, input_[index])); - thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]); - thread_state.max_activations_[i] = 
std::max(thread_state.max_activations_[i], output_[index]); - } - } - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - const uint64_t offset, - const uint64_t count) { - - auto& thread_state = thread_states_[th.thread_idx()]; - -#if defined (USE_SSE2) - - { - static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4)); - __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4)); - __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4)); - __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4)); - - __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]); - __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]); - __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]); - __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]); - - grad0 = _mm_andnot_ps(clipped0, grad0); - grad1 = _mm_andnot_ps(clipped1, grad1); - grad2 = _mm_andnot_ps(clipped2, grad2); - grad3 = _mm_andnot_ps(clipped3, grad3); - - _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0); - _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1); - _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2); - _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3); - - const int clipped_mask = - (_mm_movemask_ps(clipped0) << 0) - | (_mm_movemask_ps(clipped1) << 4) - | (_mm_movemask_ps(clipped2) << 8) - | (_mm_movemask_ps(clipped3) << 12); - - thread_state.num_clipped_ += popcount(clipped_mask); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne); - gradients_[index] = gradients[index] * !clipped; - thread_state.num_clipped_ += clipped; - } - } - -#endif - - thread_state.num_total_ += count * kOutputDimensions; - - previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void reduce_thread_state() - { - for (IndexType i = 1; i < thread_states_.size(); ++i) - { - thread_states_[0] += thread_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) - { - previous_layer_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - - reset_stats(); - } - - void reset_stats() { - for(auto& state : thread_states_) - state.reset(); - } - - // Check if there are any problems with learning - void check_health() { - - reduce_thread_state(); - - auto& main_thread_state = thread_states_[0]; - - const auto 
largest_min_activation = *std::max_element( - std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_)); - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - largest min activation = " << largest_min_activation - << " , smallest max activation = " << smallest_max_activation - << std::endl; - - out << " - clipped " << static_cast(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs" - << std::endl; - - out.unlock(); - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); - - // number of samples in mini-batch - IndexType batch_size_; - - const LearnFloatType* input_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - - struct alignas(kCacheLineSize) ThreadState - { - // Health check statistics - LearnFloatType min_activations_[kOutputDimensions]; - LearnFloatType max_activations_[kOutputDimensions]; - uint64_t num_clipped_; - uint64_t num_total_; - - ThreadState() { reset(); } - - ThreadState& operator+=(const ThreadState& other) - { - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]); - } - - for (IndexType i = 0; i < kOutputDimensions; ++i) - { - max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]); - } - - num_clipped_ += other.num_clipped_; - num_total_ += other.num_total_; - - return *this; - } - - void reset() - { - std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits::lowest()); - num_clipped_ = 0; - num_total_ = 0; - } - }; - - std::vector> thread_states_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h deleted file mode 100644 index b0e0ebba..00000000 --- a/src/nnue/trainer/trainer_feature_transformer.h +++ /dev/null @@ -1,783 +0,0 @@ -#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ -#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "features/all_factorizers.h" - -#include "learn/learn.h" - -#include "nnue/nnue_feature_transformer.h" - -#include "thread.h" - -#include -#include -#include -#include -#include - -// Specialization for feature transformer of learning class template of NNUE evaluation function -namespace Eval::NNUE { - - // Learning: Input feature converter - template <> - class Trainer { - private: - // Type of layer to learn - using LayerType = FeatureTransformer; - - public: - template - friend struct AlignedDeleter; - - template - friend std::shared_ptr 
make_aligned_shared_ptr(ArgumentTypes&&... arguments); - - // factory function - static std::shared_ptr create(LayerType* target_layer) { - return make_aligned_shared_ptr(target_layer); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - if (receive_message("momentum", message)) { - momentum_ = static_cast(std::stod(message->value)); - } - - if (receive_message("learning_rate_scale", message)) { - learning_rate_scale_ = - static_cast(std::stod(message->value)); - } - - if (receive_message("reset", message)) { - dequantize_parameters(); - } - - if (receive_message("quantize_parameters", message)) { - quantize_parameters(); - } - - if (receive_message("clear_unobserved_feature_weights", message)) { - clear_unobserved_feature_weights(); - } - - if (receive_message("check_health", message)) { - check_health(); - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - std::fill(std::begin(weights_), std::end(weights_), +kZero); - - const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions); - auto distribution = std::normal_distribution(0.0, kSigma); - - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - const auto weight = static_cast(distribution(rng)); - weights_[i] = weight; - } - - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast(0.5); - } - - quantize_parameters(); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kOutputDimensions * size); - } - - if (thread_stat_states_.size() < thread_pool.size()) - { - thread_stat_states_.resize(thread_pool.size()); - } - - if (thread_bias_states_.size() < thread_pool.size()) - { - thread_bias_states_.resize(thread_pool.size()); - } - - batch_ = &*batch_begin; - batch_size_ = size; - - auto& main_thread_bias_state = thread_bias_states_[0]; - -#if defined(USE_BLAS) - - cblas_sscal( - kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1 - ); - -#else - - Blas::sscal( - kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1 - ); - -#endif - - for (IndexType i = 1; i < thread_bias_states_.size(); ++i) - thread_bias_states_[i].reset(); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - - auto& thread_stat_state = thread_stat_states_[th.thread_idx()]; - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - -#if defined(USE_BLAS) - - cblas_scopy( - kHalfDimensions, biases_, 1, &output_[output_offset], 1 - ); - - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.get_index(); - cblas_saxpy( - kHalfDimensions, (float)feature.get_count(), - &weights_[weights_offset], 1, &output_[output_offset], 1 - ); - } - -#else - - Blas::scopy( - kHalfDimensions, biases_, 1, &output_[output_offset], 1 - ); - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType weights_offset = kHalfDimensions * feature.get_index(); - Blas::saxpy( - kHalfDimensions, (float)feature.get_count(), - 
&weights_[weights_offset], &output_[output_offset] - ); - } - -#endif - } - } - -#if defined (USE_SSE2) - - { - static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - auto m128_hmin_ps = [](__m128 x3210) { - __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032); - // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ] - __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1)); - return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13)); - }; - - auto m128_hmax_ps = [](__m128 x3210) { - __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032); - // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ] - __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1)); - return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13)); - }; - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_); - __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_); - __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_); - __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 min01 = _mm_min_ps(out0, out1); - __m128 min23 = _mm_min_ps(out2, out3); - - __m128 max01 = _mm_max_ps(out0, out1); - __m128 max23 = _mm_max_ps(out2, out3); - - min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01); - min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23); - max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01); - max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23); - - out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0)); - out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1)); - out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2)); - out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3)); - - _mm_storeu_ps(&output_[batch_offset + i + 0], out0); - _mm_storeu_ps(&output_[batch_offset + i + 4], out1); - _mm_storeu_ps(&output_[batch_offset + i + 8], out2); - _mm_storeu_ps(&output_[batch_offset + i + 12], out3); - } - } - - thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1)); - thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1)); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - - for (IndexType half = 0; half < 2; ++half) - { - const IndexType half_offset = batch_offset + half * kHalfDimensions; - for (IndexType i = 0; i < kHalfDimensions; i += 16) - { - const __m128 out0 = _mm_loadu_ps(&output_[i + 0 + half_offset]); - const __m128 out1 = _mm_loadu_ps(&output_[i + 4 + half_offset]); - const __m128 out2 = _mm_loadu_ps(&output_[i + 8 + half_offset]); - const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]); - - __m128 minact0 = 
_mm_loadu_ps(&thread_stat_state.min_activations_[i + 0]); - __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 4]); - __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 8]); - __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]); - - __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 0]); - __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 4]); - __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 8]); - __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]); - - minact0 = _mm_min_ps(out0, minact0); - minact1 = _mm_min_ps(out1, minact1); - minact2 = _mm_min_ps(out2, minact2); - minact3 = _mm_min_ps(out3, minact3); - - maxact0 = _mm_max_ps(out0, maxact0); - maxact1 = _mm_max_ps(out1, maxact1); - maxact2 = _mm_max_ps(out2, maxact2); - maxact3 = _mm_max_ps(out3, maxact3); - - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 0], minact0); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 4], minact1); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 8], minact2); - _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3); - - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 0], maxact0); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 4], maxact1); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 8], maxact2); - _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3); - } - } - } - } - -#else - - // clipped ReLU - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]); - thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]); - output_[index] = std::max(+kZero, std::min(+kOne, output_[index])); - const IndexType t = i % kHalfDimensions; - thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]); - thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]); - } - } - -#endif - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - auto& thread_stat_state = thread_stat_states_[th.thread_idx()]; - auto& thread_bias_state = thread_bias_states_[th.thread_idx()]; - -#if defined (USE_SSE2) - - { - static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time"); - - const __m128 kZero4 = _mm_set1_ps(+kZero); - const __m128 kOne4 = _mm_set1_ps(+kOne); - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; i += 16) - { - __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]); - __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]); - __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]); - __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]); - - __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4)); - __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4)); - __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4)); - __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), 
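// Note on the four `clipped` masks built just above: _mm_cmple_ps/_mm_cmpge_ps
// produce all-ones lanes wherever the clipped-ReLU output saturated at 0 or 1,
// and _mm_or_ps folds both saturation cases into one mask per group of four
// floats. The _mm_andnot_ps calls that follow zero the incoming gradient in
// exactly those lanes (the derivative of the clip is 0 outside (0, 1) and 1
// inside), while _mm_movemask_ps plus popcount merely count the clipped lanes
// for the check_health() statistics.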
_mm_cmpge_ps(out3, kOne4)); - - __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]); - __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]); - __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]); - __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]); - - grad0 = _mm_andnot_ps(clipped0, grad0); - grad1 = _mm_andnot_ps(clipped1, grad1); - grad2 = _mm_andnot_ps(clipped2, grad2); - grad3 = _mm_andnot_ps(clipped3, grad3); - - _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0); - _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1); - _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2); - _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3); - - const int clipped_mask = - (_mm_movemask_ps(clipped0) << 0) - | (_mm_movemask_ps(clipped1) << 4) - | (_mm_movemask_ps(clipped2) << 8) - | (_mm_movemask_ps(clipped3) << 12); - - thread_stat_state.num_clipped_ += popcount(clipped_mask); - } - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - const IndexType index = batch_offset + i; - const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne); - gradients_[index] = gradients[index] * !clipped; - thread_stat_state.num_clipped_ += clipped; - } - } - -#endif - - thread_stat_state.num_total_ += count * kOutputDimensions; - -#if defined(USE_BLAS) - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - cblas_saxpy( - kHalfDimensions, 1.0, - &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1 - ); - } - } - -#else - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - Blas::saxpy( - kHalfDimensions, 1.0, - &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1 - ); - } - } - -#endif - } - - void reduce_thread_stat_state() - { - for (IndexType i = 1; i < thread_stat_states_.size(); ++i) - { - thread_stat_states_[0] += thread_stat_states_[i]; - } - } - - void reduce_thread_bias_state() - { - for (IndexType i = 1; i < thread_bias_states_.size(); ++i) - { - thread_bias_states_[0] += thread_bias_states_[i]; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - - const LearnFloatType local_learning_rate = - learning_rate * learning_rate_scale_; - - // Since the weight matrix updates only the columns corresponding to the features that appeared in the input, - // Correct the learning rate and adjust the scale without using momentum - const LearnFloatType effective_learning_rate = - static_cast(local_learning_rate / (1.0 - momentum_)); - - reduce_thread_bias_state(); - - auto& main_thread_state = thread_bias_states_[0]; - -#if defined(USE_BLAS) - - cblas_saxpy( - kHalfDimensions, -local_learning_rate, - main_thread_state.biases_diff_, 1, biases_, 1 - ); - -#else - - Blas::saxpy( - kHalfDimensions, -local_learning_rate, - main_thread_state.biases_diff_, 1, biases_, 1 - ); - -#endif - - thread_pool.execute_with_workers( - [&, num_threads = thread_pool.size()](Thread& th) { - const auto thread_index = th.thread_idx(); - - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - - for 
(IndexType c = 0; c < 2; ++c) { - const IndexType output_offset = batch_offset + kHalfDimensions * c; - for (const auto& feature : batch_[b].training_features[c]) { - const IndexType feature_index = feature.get_index(); - const IndexType weights_offset = - kHalfDimensions * feature_index; -#if defined (USE_SSE2) - _mm_prefetch(reinterpret_cast(&weights_[weights_offset]), _MM_HINT_T2); -#endif - - // We assign each bucket a continuous range of bits at least - // of cache line size to prevent false sharing. - // For HalfKP this is enough to saturate about 80 threads. - const IndexType thread_bucket = - (feature_index / BitsetType::best_concurrent_access_stride) - % num_threads; - - if (thread_bucket != thread_index) - continue; - - // This operation can be performed safely because - // each thread accesses a different memory location - // (even a different cache line) - observed_features.set(feature_index); - - const auto scale = static_cast( - effective_learning_rate / feature.get_count()); - -#if defined (USE_BLAS) - - cblas_saxpy( - kHalfDimensions, -scale, - &gradients_[output_offset], 1, - &weights_[weights_offset], 1 - ); - -#else - - Blas::saxpy( - kHalfDimensions, -scale, - &gradients_[output_offset], - &weights_[weights_offset] - ); - -#endif - } - } - } - } - ); - - thread_pool.wait_for_workers_finished(); - } - - private: - // constructor - Trainer(LayerType* target_layer) : - batch_(nullptr), - batch_size_(0), - target_layer_(target_layer), - biases_(), - weights_(), - momentum_(0.2), - learning_rate_scale_(1.0) { - - dequantize_parameters(); - } - - // Weight saturation and parameterization - void quantize_parameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - target_layer_->biases_[i] = - round(biases_[i] * kBiasScale); - } - - std::vector training_features; - - Threads.for_each_index_with_workers( - 0, RawFeatures::kDimensions, - [this, training_features](Thread&, int j) mutable { - training_features.clear(); - Features::Factorizer::append_training_features( - j, &training_features); - - for (IndexType i = 0; i < kHalfDimensions; ++i) { - double sum = 0.0; - for (const auto& feature : training_features) { - sum += weights_[kHalfDimensions * feature.get_index() + i]; - } - - target_layer_->weights_[kHalfDimensions * j + i] = - round(sum * kWeightScale); - } - } - ); - Threads.wait_for_workers_finished(); - } - - void reset_stats() { - for (auto& state : thread_stat_states_) - state.reset(); - } - - // read parameterized integer - void dequantize_parameters() { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - biases_[i] = static_cast( - target_layer_->biases_[i] / kBiasScale); - } - - std::fill(std::begin(weights_), std::end(weights_), +kZero); - - for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) { - weights_[i] = static_cast( - target_layer_->weights_[i] / kWeightScale); - } - - reset_stats(); - - for (auto& state : thread_bias_states_) - state.reset(); - } - - // Set the weight corresponding to the feature that does not appear in the learning data to 0 - void clear_unobserved_feature_weights() { - for (IndexType i = 0; i < kInputDimensions; ++i) { - if (!observed_features.test(i)) { - std::fill(std::begin(weights_) + kHalfDimensions * i, - std::begin(weights_) + kHalfDimensions * (i + 1), +kZero); - } - } - - quantize_parameters(); - } - - // Check if there are any problems with learning - void check_health() { - - constexpr LearnFloatType kPreActivationLimit = - std::numeric_limits::max() / - kWeightScale; - - 
reduce_thread_stat_state(); - - auto& main_thread_state = thread_stat_states_[0]; - - const auto largest_min_activation = *std::max_element( - std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_)); - const auto smallest_max_activation = *std::min_element( - std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_)); - - double abs_bias_sum = 0.0; - double abs_weight_sum = 0.0; - - for(auto b : biases_) - abs_bias_sum += std::abs(b); - - std::vector training_features; - for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) - { - training_features.clear(); - Features::Factorizer::append_training_features( - j, &training_features); - - for (const auto& feature : training_features) { - for (IndexType i = 0; i < kHalfDimensions; ++i) { - abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]); - } - } - } - - auto out = sync_region_cout.new_region(); - - out << "INFO (check_health):" - << " layer " << LayerType::kLayerIndex - << " - " << LayerType::get_name() - << std::endl; - - out << " - observed " << observed_features.count() - << " (out of " << kInputDimensions << ") features" - << std::endl; - - out << " - (min, max) of pre-activations = " - << main_thread_state.min_pre_activation_ << ", " - << main_thread_state.max_pre_activation_ << " (limit = " - << kPreActivationLimit << ")" - << std::endl; - - out << " - largest min activation = " << largest_min_activation - << " , smallest max activation = " << smallest_max_activation - << std::endl; - - out << " - avg_abs_bias = " << abs_bias_sum / std::size(biases_) << std::endl; - out << " - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl; - - out << " - clipped " << static_cast(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs" - << std::endl; - - out.unlock(); - - reset_stats(); - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - Features::Factorizer::get_dimensions(); - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions; - - // Coefficient used for parameterization - static constexpr LearnFloatType kActivationScale = - std::numeric_limits::max(); - static constexpr LearnFloatType kBiasScale = kActivationScale; - static constexpr LearnFloatType kWeightScale = kActivationScale; - - // LearnFloatType constant - static constexpr LearnFloatType kZero = static_cast(0.0); - static constexpr LearnFloatType kOne = static_cast(1.0); - - // mini batch - const Example* batch_; - IndexType batch_size_; - - // layer to learn - LayerType* const target_layer_; - - // parameter - alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions]; - alignas(kCacheLineSize) - LearnFloatType weights_[kHalfDimensions * kInputDimensions]; - - // Buffer used for updating parameters - std::vector> gradients_; - - // Forward propagation buffer - std::vector> output_; - - // Features that appeared in the training data - using BitsetType = LargeBitset; - BitsetType observed_features; - - // hyper parameter - LearnFloatType momentum_; - LearnFloatType learning_rate_scale_; - - struct alignas(kCacheLineSize) ThreadStatState - { - alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions]; - alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions]; - LearnFloatType min_pre_activation_; - LearnFloatType max_pre_activation_; - uint64_t 
num_clipped_; - uint64_t num_total_; - - ThreadStatState() { reset(); } - - ThreadStatState& operator+=(const ThreadStatState& other) - { - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]); - } - - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]); - } - - min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_); - max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_); - - num_clipped_ += other.num_clipped_; - num_total_ += other.num_total_; - - return *this; - } - - void reset() - { - std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits::max()); - std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits::lowest()); - min_pre_activation_ = std::numeric_limits::max(); - max_pre_activation_ = std::numeric_limits::lowest(); - num_clipped_ = 0; - num_total_ = 0; - } - }; - - struct alignas(kCacheLineSize) ThreadBiasState - { - alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions]; - - ThreadBiasState() { reset(); } - - ThreadBiasState& operator+=(const ThreadBiasState& other) - { - for (IndexType i = 0; i < kHalfDimensions; ++i) - { - biases_diff_[i] += other.biases_diff_[i]; - } - - return *this; - } - - void reset() - { - std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f); - } - }; - - std::vector> thread_stat_states_; - std::vector> thread_bias_states_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h deleted file mode 100644 index ff1265dc..00000000 --- a/src/nnue/trainer/trainer_input_slice.h +++ /dev/null @@ -1,383 +0,0 @@ -#ifndef _NNUE_TRAINER_INPUT_SLICE_H_ -#define _NNUE_TRAINER_INPUT_SLICE_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/input_slice.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for InputSlice -namespace Eval::NNUE { - - // Learning: Input layer - // This is tricky. It exists because when there's more than one trainer - // on top of a single feature transformer we want to only call propagate/backpropagate - // on the feature transformer once. This is straightforward in the old - // multithreading case, because propagate/backpropagate is called just once from the - // main thread. But with the current implementation of coarser multithreading - // we end up calling each method from each thread. Therefore we have to keep - // the num_calls and current_operation per thread basis, each thread must work - // on its designated batch slice, and the only synchronization points are - // step_start and step_end - for which we use state of the first thread. - // Each thread requires their own bookkeeping because it's possible that - // one thread is still in propagate of some batch slice while the other thread - // is doing backpropagate of some other slice. We also ensure the thread state - // isn't suspectible to false sharing by using a full cache line for the state. 
- class SharedInputTrainer { - public: - // factory function - static std::shared_ptr create( - FeatureTransformer* ft) { - - static std::shared_ptr instance; - - if (!instance) { - instance.reset(new SharedInputTrainer(ft)); - } - - ++instance->num_referrers_; - - return instance; - } - - // Set options such as hyperparameters - void send_message(Message* message) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kSendMessage; - feature_transformer_trainer_->send_message(message); - } - - assert(thread_state.current_operation == Operation::kSendMessage); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kInitialize; - feature_transformer_trainer_->initialize(rng); - } - - assert(thread_state.current_operation == Operation::kInitialize); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)gradients_.size() < (long)kInputDimensions * size) { - gradients_.resize(kInputDimensions * size); - } - - if (thread_states_.size() < thread_pool.size()) - { - thread_states_.resize(thread_pool.size()); - } - - batch_size_ = size; - - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kStepStart; - output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end); - } - - assert(thread_state.current_operation == Operation::kStepStart); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - - return output_; - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - const auto thread_id = th.thread_idx(); - - auto& thread_state = thread_states_[thread_id]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kPropagate; - feature_transformer_trainer_->propagate(th, offset, count); - } - - assert(thread_state.current_operation == Operation::kPropagate); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - const auto thread_id = th.thread_idx(); - - auto& thread_state = thread_states_[thread_id]; - - if (num_referrers_ == 1) { - feature_transformer_trainer_->backpropagate(th, gradients, offset, count); - return; - } - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kBackPropagate; - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] = static_cast(0.0); - } - } - } - - assert(thread_state.current_operation == Operation::kBackPropagate); - - for (IndexType b = offset; b < offset + 
count; ++b) { - const IndexType batch_offset = kInputDimensions * b; - for (IndexType i = 0; i < kInputDimensions; ++i) { - gradients_[batch_offset + i] += gradients[batch_offset + i]; - } - } - - if (++thread_state.num_calls == num_referrers_) { - feature_transformer_trainer_->backpropagate( - th, gradients_.data(), offset, count); - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - auto& thread_state = thread_states_[0]; - - if (thread_state.num_calls == 0) { - thread_state.current_operation = Operation::kStepEnd; - feature_transformer_trainer_->step_end(thread_pool, learning_rate); - } - - assert(thread_state.current_operation == Operation::kStepEnd); - - if (++thread_state.num_calls == num_referrers_) { - thread_state.num_calls = 0; - thread_state.current_operation = Operation::kNone; - } - } - - private: - // constructor - SharedInputTrainer(FeatureTransformer* ft) : - batch_size_(0), - num_referrers_(0), - thread_states_(1), - feature_transformer_trainer_(Trainer::create( - ft)), - output_(nullptr) { - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; - - // type of processing - enum class Operation { - kNone, - kSendMessage, - kInitialize, - kStepStart, - kPropagate, - kBackPropagate, - kStepEnd, - }; - - // number of samples in mini-batch - IndexType batch_size_; - - // number of layers sharing this layer as input - std::uint32_t num_referrers_; - - struct alignas(kCacheLineSize) ThreadState - { - std::uint32_t num_calls{0}; - - // current processing type - Operation current_operation = Operation::kNone; - }; - - // Number of times the current process has been called - std::vector> thread_states_; - - // Trainer of input feature converter - const std::shared_ptr> - feature_transformer_trainer_; - - // pointer to output shared for forward propagation - const LearnFloatType* output_; - - // buffer for back propagation - std::vector> gradients_; - }; - - // Learning: Input layer - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::InputSlice; - - public: - // factory function - static std::shared_ptr create( - LayerType* /*target_layer*/, FeatureTransformer* ft) { - - return std::shared_ptr(new Trainer(ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - shared_input_trainer_->send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - shared_input_trainer_->initialize(rng); - } - - const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector::const_iterator batch_begin, std::vector::const_iterator batch_end) - { - const auto size = batch_end - batch_begin; - - if ((long)output_.size() < (long)kOutputDimensions * size) { - output_.resize(kOutputDimensions * size); - gradients_.resize(kInputDimensions * size); - } - - batch_size_ = size; - - input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end); - - return output_.data(); - } - - // forward propagation - void propagate(Thread& th, uint64_t offset, uint64_t count) { - - shared_input_trainer_->propagate(th, offset, count); - - for (IndexType b = offset; b < offset + count; ++b) { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; - -#if defined(USE_BLAS) - - cblas_scopy( - kOutputDimensions, 
&input_[input_offset + Offset], 1, - &output_[output_offset], 1 - ); -#else - - Blas::scopy( - kOutputDimensions, &input_[input_offset + Offset], 1, - &output_[output_offset], 1 - ); - -#endif - } - } - - // backpropagation - void backpropagate(Thread& th, - const LearnFloatType* gradients, - uint64_t offset, - uint64_t count) { - - for (IndexType b = offset; b < offset + count; ++b) - { - const IndexType input_offset = kInputDimensions * b; - const IndexType output_offset = kOutputDimensions * b; - - IndexType i = 0; - if constexpr (Offset > 0) - { - for (; i < Offset; ++i) { - gradients_[input_offset + i] = static_cast(0.0); - } - } - - for (; i < Offset + kOutputDimensions; ++i) { - gradients_[input_offset + i] = gradients[output_offset + i - Offset]; - } - - if constexpr (Offset + kOutputDimensions < kInputDimensions) - { - for (; i < kInputDimensions; ++i) - { - gradients_[input_offset + i] = static_cast(0.0); - } - } - } - - shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count); - } - - void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) { - shared_input_trainer_->step_end(thread_pool, learning_rate); - } - - private: - // constructor - Trainer(FeatureTransformer* ft) : - batch_size_(0), - shared_input_trainer_(SharedInputTrainer::create(ft)) { - } - - // number of input/output dimensions - static constexpr IndexType kInputDimensions = - FeatureTransformer::kOutputDimensions; - static constexpr IndexType kOutputDimensions = OutputDimensions; - static_assert(Offset + kOutputDimensions <= kInputDimensions, ""); - - // number of samples in mini-batch - IndexType batch_size_; - - const LearnFloatType* input_; - - // Trainer of shared input layer - const std::shared_ptr shared_input_trainer_; - - // Forward propagation buffer - std::vector> output_; - - // buffer for back propagation - std::vector> gradients_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h deleted file mode 100644 index 88ff302c..00000000 --- a/src/nnue/trainer/trainer_sum.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef _NNUE_TRAINER_SUM_H_ -#define _NNUE_TRAINER_SUM_H_ - -#include "trainer.h" - -#include "extra/stockfish_blas.h" - -#include "learn/learn.h" - -#include "nnue/layers/sum.h" - -#include "thread.h" - -// Specialization of NNUE evaluation function learning class template for Sum -namespace Eval::NNUE { - - // Learning: A layer that sums the outputs of multiple layers - template - class Trainer> : - Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; - using Tail = Trainer>; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - // The results of other member functions do not depend on the processing order, so - // Tail is processed first for the purpose of simplifying the implementation, but - // SendMessage processes Head first to make it easier to understand subscript correspondence - previous_layer_trainer_->send_message(message); - Tail::send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - Tail::initialize(rng); - previous_layer_trainer_->initialize(rng); - } - - // forward propagation - /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector& batch) { - batch_size_ = 
static_cast(batch.size()); - auto output = Tail::propagate(thread_pool, batch); - const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch); - -#if defined(USE_BLAS) - - cblas_saxpy( - kOutputDimensions * batch_size_, 1.0, - head_output, 1, output, 1 - ); - -#else - - Blas::saxpy( - thread_pool, - kOutputDimensions * batch_size_, 1.0, - head_output, 1, output, 1 - ); - -#endif - return output; - } - - // backpropagation - void backpropagate(ThreadPool& thread_pool, - const LearnFloatType* gradients, - LearnFloatType learning_rate) { - - Tail::backpropagate(thread_pool, gradients, learning_rate); - previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft): - Tail(target_layer, ft), - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } - - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // make subclass friend - template - friend class Trainer; - - // number of samples in mini-batch - IndexType batch_size_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - }; - - - // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument) - template - class Trainer> { - private: - // Type of layer to learn - using LayerType = Layers::Sum; - - public: - // factory function - static std::shared_ptr create( - LayerType* target_layer, FeatureTransformer* ft) { - - return std::shared_ptr( - new Trainer(target_layer, ft)); - } - - // Set options such as hyperparameters - void send_message(Message* message) { - previous_layer_trainer_->send_message(message); - } - - // Initialize the parameters with random numbers - template - void initialize(RNG& rng) { - previous_layer_trainer_->initialize(rng); - } - - // forward propagation - /*const*/ LearnFloatType* propagate(const std::vector& batch) { - if (output_.size() < kOutputDimensions * batch.size()) { - output_.resize(kOutputDimensions * batch.size()); - } - - batch_size_ = static_cast(batch.size()); - const auto output = previous_layer_trainer_->propagate(batch); - -#if defined(USE_BLAS) - cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1); -#else - for (IndexType b = 0; b < batch_size_; ++b) { - const IndexType batch_offset = kOutputDimensions * b; - for (IndexType i = 0; i < kOutputDimensions; ++i) { - output_[batch_offset + i] = output[batch_offset + i]; - } - } - -#endif - return output_.data(); - } - - // backpropagation - void backpropagate(const LearnFloatType* gradients, - LearnFloatType learning_rate) { - - previous_layer_trainer_->backpropagate(gradients, learning_rate); - } - - private: - // constructor - Trainer(LayerType* target_layer, FeatureTransformer* ft) : - batch_size_(0), - previous_layer_trainer_(Trainer::create( - &target_layer->previous_layer_, ft)), - target_layer_(target_layer) { - } - - // number of input/output dimensions - static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions; - - // make subclass friend - template - friend class Trainer; - - // number of samples in mini-batch - IndexType batch_size_; - - // Trainer of the previous layer - const std::shared_ptr> previous_layer_trainer_; - - // layer to learn - LayerType* const target_layer_; - - // Forward 
propagation buffer - std::vector> output_; - }; - -} // namespace Eval::NNUE - -#endif diff --git a/src/uci.cpp b/src/uci.cpp index 7da2881f..9a9a9e3c 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -22,11 +22,9 @@ #include #include -#include "extra/stockfish_blas.h" #include "nnue/evaluate_nnue.h" #include "evaluate.h" #include "movegen.h" -#include "nnue/nnue_test_command.h" #include "position.h" #include "search.h" #include "syzygy/tbprobe.h" @@ -37,7 +35,6 @@ #include "learn/gensfen.h" #include "learn/gensfen_nonpv.h" -#include "learn/learn.h" #include "learn/convert.h" #include "learn/transform.h" #include "learn/stats.h" @@ -49,17 +46,6 @@ extern vector setup_bench(const Position&, istream&); // FEN string of the initial position, normal chess const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; -void test_cmd(Position& pos, istringstream& is) -{ - // Initialize as it may be searched. - Eval::NNUE::init(); - - std::string param; - is >> param; - - if (param == "nnue") Eval::NNUE::test_command(pos, is); -} - namespace { // position() is called when engine receives the "position" UCI command. @@ -344,7 +330,6 @@ void UCI::loop(int argc, char* argv[]) { else if (token == "gensfen") Learner::gensfen(is); else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is); - else if (token == "learn") Learner::learn(is); else if (token == "convert") Learner::convert(is); else if (token == "convert_bin") Learner::convert_bin(is); else if (token == "convert_plain") Learner::convert_plain(is); @@ -361,17 +346,7 @@ void UCI::loop(int argc, char* argv[]) { std::cout << th.thread_idx() << '\n'; }); } - else if (token == "blastest") - { - Blas::test(Threads); - } - else if (token == "blasbench") - { - Blas::bench(Threads); - } - // test command - else if (token == "test") test_cmd(pos, is); else sync_cout << "Unknown command: " << cmd << sync_endl;