float24: take care of the denormalized values

This commit is contained in:
B3n30 2017-07-30 13:07:32 +02:00
parent 6cae218e66
commit 03db0ea058

View File

@ -28,24 +28,22 @@ public:
u32 hex; u32 hex;
std::memcpy(&hex, &val, sizeof(u32)); std::memcpy(&hex, &val, sizeof(u32));
if (val == 0.f) // Take care of special case 0.0
return Float<M, E>::Zero(); // Pica200 has no -0 if (val == 0.f) {
const int bias = 127 - Float<M, E>::EXPONENT_BIAS; // Since Pica has no -0, we don't have to check for the sign
u32 sign = hex >> 31; return Float<M, E>::Zero();
u32 exponent = ((hex >> 23) & ((1 << 8) - 1)) - bias; }
u32 mantissa = (hex & ((1 << 23) - 1)) >> (23 - M - 1); const s32 bias = 127 - EXPONENT_BIAS;
const s32 sign = hex & 1 << 31;
// calculate with a M+1-bit mantissa and round down to M bit const s32 exponent = ((hex >> 23) & ((1 << 8) - 1)) - bias;
if (mantissa << 31)
mantissa = ((mantissa >> 1) + 1);
else
mantissa = (mantissa >> 1);
// Take care of the special cases NAN and INF
if (std::isnan(val)) { if (std::isnan(val)) {
Float<M, E> res; Float<M, E> res;
res.value = val; res.value = val;
return res; return res;
} else if (exponent & (1 << E)) { } else if (exponent > EXPONENT_MASK) {
// exponent is bigger than the maximum value for E, thus infinity
if (sign) { if (sign) {
Float<M, E> res; Float<M, E> res;
res.value = -std::numeric_limits<float>::infinity(); res.value = -std::numeric_limits<float>::infinity();
@ -57,7 +55,28 @@ public:
} }
} }
u32 res = (sign << 31) | ((exponent + bias) << 23) | (mantissa << (23 - M)); // calculate with a M+1-bit mantissa and round down to M bit
u32 mantissa = (hex & ((1 << 23) - 1)) >> (23 - M - 1);
if (mantissa & 1) {
// TODO(B3N30): Check that this is the correct PICA rounding
mantissa = ((mantissa >> 1) + 1);
} else {
mantissa = (mantissa >> 1);
}
// Take care of the denormalized values. Are the limits correct?
if ((exponent < -EXPONENT_BIAS) && (exponent > -(s32)(EXPONENT_BIAS + M))) {
// Mask out the least significant bits that would get lost due to normalization
mantissa &= ~((1 << (-exponent - EXPONENT_BIAS)) - 1);
} else if (exponent <= -(s32)(EXPONENT_BIAS + M)) {
// It's even too small for denormalized values
return Float<M, E>::Zero();
}
// TODO(B3N30): Make this faster. Compared to the original hex only the mantissa is changed
// slightly
u32 res = sign | ((exponent + bias) << 23) | (mantissa << (23 - M));
Float<M, E> result; Float<M, E> result;
std::memcpy(&result.value, &res, sizeof(float)); std::memcpy(&result.value, &res, sizeof(float));
return result; return result;
@ -66,9 +85,9 @@ public:
static Float<M, E> FromRaw(u32 hex) { static Float<M, E> FromRaw(u32 hex) {
Float<M, E> res; Float<M, E> res;
const int width = M + E + 1; const s32 width = M + E + 1;
const int bias = 128 - (1 << (E - 1)); const s32 bias = 127 - EXPONENT_BIAS;
const int exponent = (hex >> M) & ((1 << E) - 1); const s32 exponent = (hex >> M) & ((1 << E) - 1);
const unsigned mantissa = hex & ((1 << M) - 1); const unsigned mantissa = hex & ((1 << M) - 1);
if (hex & ((1 << (width - 1)) - 1)) if (hex & ((1 << (width - 1)) - 1))
@ -161,10 +180,8 @@ public:
private: private:
static_assert(M + E + 1 <= 32, "Maximum bitsize is 32"); static_assert(M + E + 1 <= 32, "Maximum bitsize is 32");
static const unsigned MASK = (1 << (M + E + 1)) - 1;
static const unsigned MANTISSA_MASK = (1 << M) - 1;
static const unsigned EXPONENT_MASK = (1 << E) - 1; static const unsigned EXPONENT_MASK = (1 << E) - 1;
static const u32 EXPONENT_BIAS = (1 << (E - 1)) - 1; static const s32 EXPONENT_BIAS = (1 << (E - 1)) - 1;
// Stored as a regular float, merely for convenience // Stored as a regular float, merely for convenience
// TODO: Perform proper arithmetic on this! // TODO: Perform proper arithmetic on this!