275 lines
7.4 KiB
C#
275 lines
7.4 KiB
C#
using System;
|
|
|
|
namespace Passer.LinearAlgebra
|
|
{
|
|
|
|
/// <summary>
/// IEEE 754 half-precision (binary16) value stored as its raw 16-bit pattern.
/// Layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
/// Conversion to and from <see cref="float"/> is done in software.
/// </summary>
public class float16
{
    //
    // Header kept from the source file; it names the C++ file this code
    // appears to be derived from (the commented-out section below is C++).
    //
    // FILE: float16.cpp
    // AUTHOR: Rob Tillaart
    // VERSION: 0.1.8
    // PURPOSE: library for Float16s for Arduino
    // URL: http://en.wikipedia.org/wiki/Half-precision_floating-point_format

    // Value of one unit in the last place of a subnormal half: 2^-24 (exact in float32).
    const float SubnormalStep = 5.9604644775390625e-08f;

    // Raw binary16 bit pattern.
    ushort _value;

    /// <summary>Creates a half-precision value equal to +0.</summary>
    public float16() { _value = 0; }

    /// <summary>Creates a half-precision value from a single-precision float.</summary>
    /// <param name="f">Value to convert; rounded to the nearest half (ties away from zero).</param>
    public float16(float f)
    {
        _value = f32tof16(f);
    }

    /// <summary>Converts the stored half-precision value to a single-precision float.</summary>
    /// <returns>The exact float32 equivalent (every binary16 value is representable in float32).</returns>
    public float toFloat()
    {
        return f16tof32(_value);
    }

    /// <summary>Returns the raw 16-bit pattern.</summary>
    public ushort GetBinary() { return _value; }

    /// <summary>Overwrites the raw 16-bit pattern without any conversion.</summary>
    /// <param name="value">Binary16 bit pattern to store.</param>
    public void SetBinary(ushort value) { _value = value; }

    //////////////////////////////////////////////////////////
    //
    // EQUALITIES
    //
    // NOTE: everything inside the block comment below is C++ syntax and is
    // not compiled; it has not been ported to C#.
    /*
    bool float16::operator ==(const float16 &f) { return (_value == f._value); }

    bool float16::operator !=(const float16 &f) { return (_value != f._value); }

    bool float16::operator >(const float16 &f) {
        if ((_value & 0x8000) && (f._value & 0x8000))
            return _value < f._value;
        if (_value & 0x8000)
            return false;
        if (f._value & 0x8000)
            return true;
        return _value > f._value;
    }

    bool float16::operator >=(const float16 &f) {
        if ((_value & 0x8000) && (f._value & 0x8000))
            return _value <= f._value;
        if (_value & 0x8000)
            return false;
        if (f._value & 0x8000)
            return true;
        return _value >= f._value;
    }

    bool float16::operator <(const float16 &f) {
        if ((_value & 0x8000) && (f._value & 0x8000))
            return _value > f._value;
        if (_value & 0x8000)
            return true;
        if (f._value & 0x8000)
            return false;
        return _value < f._value;
    }

    bool float16::operator <=(const float16 &f) {
        if ((_value & 0x8000) && (f._value & 0x8000))
            return _value >= f._value;
        if (_value & 0x8000)
            return true;
        if (f._value & 0x8000)
            return false;
        return _value <= f._value;
    }

    //////////////////////////////////////////////////////////
    //
    // NEGATION
    //
    float16 float16::operator -() {
        float16 f16;
        f16.setBinary(_value ^ 0x8000);
        return f16;
    }

    //////////////////////////////////////////////////////////
    //
    // MATH
    //
    float16 float16::operator +(const float16 &f) {
        return float16(this->toDouble() + f.toDouble());
    }

    float16 float16::operator -(const float16 &f) {
        return float16(this->toDouble() - f.toDouble());
    }

    float16 float16::operator *(const float16 &f) {
        return float16(this->toDouble() * f.toDouble());
    }

    float16 float16::operator /(const float16 &f) {
        return float16(this->toDouble() / f.toDouble());
    }

    float16 & float16::operator+=(const float16 &f) {
        *this = this->toDouble() + f.toDouble();
        return *this;
    }

    float16 & float16::operator-=(const float16 &f) {
        *this = this->toDouble() - f.toDouble();
        return *this;
    }

    float16 & float16::operator*=(const float16 &f) {
        *this = this->toDouble() * f.toDouble();
        return *this;
    }

    float16 & float16::operator/=(const float16 &f) {
        *this = this->toDouble() / f.toDouble();
        return *this;
    }

    //////////////////////////////////////////////////////////
    //
    // MATH HELPER FUNCTIONS
    //
    int float16::sign() {
        if (_value & 0x8000)
            return -1;
        if (_value & 0xFFFF)
            return 1;
        return 0;
    }

    bool float16::isZero() { return ((_value & 0x7FFF) == 0x0000); }

    bool float16::isNaN() {
        if ((_value & 0x7C00) != 0x7C00)
            return false;
        if ((_value & 0x03FF) == 0x0000)
            return false;
        return true;
    }

    bool float16::isInf() { return ((_value == 0x7C00) || (_value == 0xFC00)); }
    */

    //////////////////////////////////////////////////////////
    //
    // CORE CONVERSION
    //

    /// <summary>Decodes a binary16 bit pattern into a float32.</summary>
    /// <param name="_value">Raw binary16 bits.</param>
    /// <returns>
    /// The decoded value. The sign of zero and infinity is preserved;
    /// any NaN pattern yields <see cref="float.NaN"/>.
    /// </returns>
    private static float f16tof32(ushort _value)
    {
        bool negative = (_value & 0x8000) != 0;
        int exp = (_value & 0x7C00) >> 10;
        int man = _value & 0x03FF;

        float magnitude;
        if (exp == 0x1F)
        {
            // All-ones exponent: infinity when the mantissa is zero, NaN otherwise.
            if (man != 0)
                return float.NaN;
            magnitude = float.PositiveInfinity;
        }
        else if (exp == 0)
        {
            // Zero and subnormals have no implicit leading 1: value = man * 2^-24.
            // The product is exact because man <= 1023.
            magnitude = man * SubnormalStep;
        }
        else
        {
            // Normal number: re-bias the exponent (15 -> 127, i.e. +112) and
            // widen the 10-bit mantissa to 23 bits. Building the float32 bit
            // pattern directly keeps the result exact.
            int bits = ((exp + 112) << 23) | (man << 13);
            magnitude = BitConverter.Int32BitsToSingle(bits);
        }

        // Negating at run time also yields a true -0.0f for the pattern 0x8000.
        return negative ? -magnitude : magnitude;
    }

    /// <summary>Encodes a float32 as a binary16 bit pattern.</summary>
    /// <param name="f">Value to encode.</param>
    /// <returns>
    /// The nearest binary16 pattern, rounding ties away from zero. Values too
    /// large for binary16 become signed infinity; values too small (including
    /// all float32 denormals) become signed zero; every NaN becomes 0xFE00.
    /// </returns>
    private static ushort f32tof16(float f)
    {
        // unchecked: the sign bit makes the Int32 negative for negative inputs,
        // which must not throw when the project is compiled with overflow checks.
        uint bits = unchecked((uint)BitConverter.SingleToInt32Bits(f));
        ushort sign = (ushort)((bits >> 16) & 0x8000);
        int exp32 = (int)((bits >> 23) & 0xFF);
        uint frac = bits & 0x007FFFFF;

        // Zero, and float32 denormals (far below the binary16 range).
        if (exp32 == 0)
            return sign;

        // Infinity and NaN. The full 23-bit mantissa is tested so that NaNs
        // whose payload lives only in the low bits are not mistaken for infinity.
        if (exp32 == 0xFF)
        {
            if (frac != 0)
                return 0xFE00; // NaN
            return (ushort)(sign | 0x7C00); // -INF : INF
        }

        // Re-bias the exponent from float32 (127) to binary16 (15).
        int exp = exp32 - 127 + 15;

        // Too large for binary16 => infinity.
        if (exp > 30)
            return (ushort)(sign | 0x7C00);

        if (exp <= 0)
        {
            // Below 2^-25 even rounding cannot reach the smallest subnormal.
            // This guard also keeps the shift count below within 13..23.
            if (exp < -10)
                return sign;

            // Subnormal result: restore the implicit leading 1, shift so that
            // one guard bit remains, then round half up on that guard bit.
            // A carry out of the mantissa lands on 0x0400, the smallest normal.
            uint significand = frac | 0x00800000;
            uint subWithGuard = significand >> (13 - exp);
            return (ushort)(sign | ((subWithGuard + 1) >> 1));
        }

        // Normal result: keep 11 mantissa bits (10 + guard) and round half up.
        // The mantissa is ADDED to the exponent field so that a rounding carry
        // propagates into the exponent (and into infinity at the very top).
        uint normWithGuard = frac >> 12;
        uint magnitude = ((uint)exp << 10) + ((normWithGuard + 1) >> 1);
        return (ushort)(sign | magnitude);
    }

    // -- END OF FILE --
}
|
|
|
|
} |