Sofern sich die Zeit findet, will ich ja ne Klasse schreiben womit man FP16-Zahlen nutzen kann. CineFX-like wird mit FP32 gerechnet, doch die Speicherung erfolgt eben als FP16. Da man ohnehin über FP32 "kommuniziert" hielte ich es jetzt für sinnvoll, als Klassen-Member überhaupt nur das (künstlich auf FP16-Niveau ungenau gemachte) entsprechende float zu halten.
Es soll aber auch möglich sein, mit einer Methode die Fp16-Binärdarstellung zu bekommen. Nun braucht man diese Umwandlung (FP32 -> FP16) aber zwei mal: Einmal um die Binär-Darstellung zurückzugeben, und einmal um aus dem FP16 wieder das (dann ungenauere) FP32 zu machen. Da würde ich nur ungerne den Code doppelt drin haben. Ansonsten müsste man das aufsplitten: Eine Methode FP32 -> FP16, und eine FP16 -> FP32. Zum Zurückgeben der Binärdarstellung braucht man nur FP32 -> FP16. Zum Abspeichern einer übergebenen Zahl jedoch beide, FP32 -> FP16 und danach gleich wieder FP16 -> FP32. Nun kommt das letztere ziemlich häufig vor. Macht es trotzdem Sinn, das eben über zwei Methodenaufrufe zu gestalten? ("Overhead" anyone?)
Eine interne Speicherung nur als ungenaues float spart, Mantisse, Sign und Exponent noch mal extra (als Int) als Member vorhalten zu müssen. Jede neue Instanz hätte mit diesen Ints ja gleich einen höheren Speicherplatz-Verbrauch. Sollte das beim Design auch eine Rolle spielen, oder kommt es auf die paar Bytes auch nicht an?
zeckensack
2003-12-12, 18:31:00
Gehst du das mit C++ an?
€
Zum Abspeichern einer übergebenen Zahl jedoch beide, FP32 -> FP16 und danach gleich wieder FP16 -> FP32
Ich würde vorschlagen, du speicherst nur als float ("FP32"), und machst die Ungenauigkeit "in place", also durch ein bisserl Zeiger-Casten und Bit-Schiebereien direkt in der FP32-Darstellung.
Das würde dir die Implementierung der kompletten Arithmetik ersparen, du brauchst die Operatoren dann jeweils nur so auszulegen:
fp16
fp16::operator * (const fp16& rhs)
{
    // Multiply at full FP32 precision, then reduce the product to FP16
    // accuracy in place before handing it back.
    fp16 product;
    product.fp32 = fp32 * rhs.fp32;
    product.mach_mich_ungenau();
    return product;
}
Das würde dir eine schier unendliche Menge möglicher Fehler ersparen.
Edit: total lumpig falschen Code korrigiert ...
Im mom siehts so aus:
fp16.h
#ifndef __FP16h
#define __FP16h
#include "fp16.cpp"
#endif
fp16.cpp
// fp16 class
// aths, 12. 12. 2003
// Unpacked fields of a 16-bit float: 1 sign bit, 5-bit exponent stored with
// a bias of 15, and a 10-bit mantissa (without the hidden leading one).
struct fp16comp {
    int sign, exponent, mantissa;
};

// Canonical special values returned by fp16::fp32tofp16().
static const fp16comp fp16NaN   = {0, 31, 1};
static const fp16comp fp16Inf   = {0, 31, 0};
static const fp16comp fp16nInf  = {1, 31, 0};
static const fp16comp fp16zero  = {0, 0, 0};
static const fp16comp fp16nzero = {1, 0, 0};

// Holds a float whose value has been reduced to what a 16-bit float can
// represent. Arithmetic is meant to be done in FP32; only storage is FP16.
//
// Assumes float is an IEEE 754 single and unsigned int has the same size.
class fp16 {
    float value;

    // Copy the object representation of a float into an unsigned int.
    // Byte-wise copying avoids the aliasing problems of pointer casts.
    static unsigned int tobits(float f) {
        unsigned int bits = 0;
        const unsigned char *src = reinterpret_cast<const unsigned char*>(&f);
        unsigned char *dst = reinterpret_cast<unsigned char*>(&bits);
        for (unsigned int k = 0; k < sizeof(f); ++k) { dst[k] = src[k]; }
        return bits;
    }

    // Inverse of tobits().
    static float frombits(unsigned int bits) {
        float f = 0;
        const unsigned char *src = reinterpret_cast<const unsigned char*>(&bits);
        unsigned char *dst = reinterpret_cast<unsigned char*>(&f);
        for (unsigned int k = 0; k < sizeof(f); ++k) { dst[k] = src[k]; }
        return f;
    }

public:
    fp16(void) { value = 0; }
    fp16(float a) { value = fp16tofp32(fp32tofp16(a)); }

    // Returns the stored (FP16-accurate) value as a float.
    float getvalue(void) const { return value; }

    // Stores a, rounded to the nearest FP16-representable value.
    void setvalue(float a) { value = fp16tofp32(fp32tofp16(a)); }

    // Expands unpacked FP16 fields to the float they denote.
    // exponent > 30 yields Inf (mantissa == 0) or NaN (mantissa != 0);
    // exponent == 0 yields zero or a denormal, i.e. mantissa * 2^-24.
    float fp16tofp32(fp16comp a) {
        unsigned int bits = static_cast<unsigned int>(a.sign & 0x1) << 31;
        unsigned int mantissa = static_cast<unsigned int>(a.mantissa) & 0x3FF;

        if (a.exponent > 30) {
            // Inf / NaN: all exponent bits set in FP32 as well.
            bits |= 0x7F800000u | (mantissa << 13);
        } else if (a.exponent == 0) {
            if (mantissa != 0) {
                // Denormal: shift the mantissa up until the hidden bit
                // appears, lowering the exponent accordingly. FP32 can hold
                // every FP16 denormal as a normalized number.
                int exponent = -14;
                while (!(mantissa & 0x400)) {
                    mantissa <<= 1;
                    --exponent;
                }
                bits |= static_cast<unsigned int>(exponent + 127) << 23;
                bits |= (mantissa & 0x3FF) << 13;
            }
            // else: signed zero, nothing to add.
        } else {
            // Normalized number: re-bias the exponent (15 -> 127) and widen
            // the mantissa from 10 to 23 bits.
            bits |= static_cast<unsigned int>(a.exponent + (127 - 15)) << 23;
            bits |= mantissa << 13;
        }
        return frombits(bits);
    }

    // Splits a float into FP16 fields, rounding to nearest (ties away from
    // zero). Values too large become +/-Inf, values below half of the
    // smallest denormal become +/-0, and every NaN maps to fp16NaN.
    fp16comp fp32tofp16(float a) {
        const unsigned int bits = tobits(a);
        const int sign = (bits >> 31) & 0x1;
        const int exp32 = (bits >> 23) & 0xFF;
        unsigned int mantissa = bits & 0x007FFFFF;
        const fp16comp infinity = sign ? fp16nInf : fp16Inf;

        if (exp32 == 255) {
            if (mantissa != 0) { return fp16NaN; }
            return infinity;
        }

        int exponent = exp32 - 127;              // remove FP32 bias
        if (exponent > 15) { return infinity; }
        if (exponent < -25) {
            // Below 2^-25 everything (including FP32 denormals and zero)
            // rounds to zero; keep the sign.
            return sign ? fp16nzero : fp16zero;
        }

        fp16comp result;
        result.sign = sign;

        if (exponent < -14) {
            // Denormal result: make the hidden bit explicit and shift the
            // 24-bit significand so that one unit equals 2^-24.
            const int shift = -1 - exponent;     // 14 .. 24
            mantissa |= 0x00800000;
            mantissa = (mantissa + (1u << (shift - 1))) >> shift;
            if (mantissa & 0x400) {
                // Rounded up to the smallest normalized number.
                result.exponent = 1;
                result.mantissa = 0;
            } else {
                result.exponent = 0;
                result.mantissa = static_cast<int>(mantissa);
            }
            return result;
        }

        // Normalized result: round the 23-bit mantissa to 10 bits.
        mantissa = (mantissa + 0x00001000) >> 13;
        if (mantissa & 0x400) {
            // Mantissa overflowed into the next binade.
            mantissa = 0;
            ++exponent;
            if (exponent > 15) { return infinity; }
        }
        result.exponent = exponent + 15;         // apply FP16 bias
        result.mantissa = static_cast<int>(mantissa);
        return result;
    }
};
Denorms gehen noch nicht so, wie gedacht.
ScottManDeath
2003-12-16, 05:01:00
ILM hat mit OpenEXR (http://www.openexr.com) ein Bildformat entwickelt, welches 16-Bit-Half-Floats unterstützt, inklusive Source-Code (ftp://ftp.gnu.org/savannah/cvs/openexr-cvs-latest.tar.gz) (die Dateien sind etwas "verkrüppelt", d.h. sie enthalten noch CVS-Infos, das ist aber mit Copy & Paste zu lösen ;)
darunter auch eine klasse für halfs
ein paar auszüge
der header
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
// Digital Ltd. LLC
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Industrial Light & Magic nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
// Primary authors:
// Florian Kainz <kainz@ilm.com>
// Rod Bogart <rgb@ilm.com>
//---------------------------------------------------------------------------
//
// half -- a 16-bit floating point number class:
//
// Type half can represent positive and negative numbers, whose
// magnitude is between roughly 6.1e-5 and 6.5e+4, with a relative
// error of 9.8e-4; numbers smaller than 6.1e-5 can be represented
// with an absolute error of 6.0e-8. All integers from -2048 to
// +2048 can be represented exactly.
//
// Type half behaves (almost) like the built-in C++ floating point
// types. In arithmetic expressions, half, float and double can be
// mixed freely. Here are a few examples:
//
// half a (3.5);
// float b (a + sqrt (a));
// a += b;
// b += a;
// b = a + 7;
//
// Conversions from half to float are lossless; all half numbers
// are exactly representable as floats.
//
// Conversions from float to half may not preserve the float's
// value exactly. If a float is not representable as a half, the
// float value is rounded to the nearest representable half. If
// a float value is exactly in the middle between the two closest
// representable half values, then the float value is rounded to
// the half with the greater magnitude.
//
// Overflows during float-to-half conversions cause arithmetic
// exceptions. An overflow occurs when the float value to be
// converted is too large to be represented as a half, or if the
// float value is an infinity or a NAN.
//
// The implementation of type half makes the following assumptions
// about the implementation of the built-in C++ types:
//
// float is an IEEE 754 single-precision number
// sizeof (float) == 4
// sizeof (unsigned int) == sizeof (float)
// alignof (unsigned int) == alignof (float)
// sizeof (unsigned short) == 2
//
//---------------------------------------------------------------------------
#ifndef _HALF_H_
#define _HALF_H_
#include <iostream>
// 16-bit floating point value stored as a raw bit pattern in _h.
// Conversions to and from float are driven by the lookup tables
// _toFloat and _eLut declared at the bottom of the class.
class half
{
public:
//-------------
// Constructors
//-------------
half (); // no initialization
half (const half &h);
half (float f);
//--------------------
// Conversion to float
//--------------------
operator float () const;
//------------
// Unary minus
//------------
half operator - () const;
//-----------
// Assignment
//-----------
// Note: all assignment operators return a copy of the updated
// object (by value), not a reference.
half operator = (half h);
half operator = (float f);
half operator += (half h);
half operator += (float f);
half operator -= (half h);
half operator -= (float f);
half operator *= (half h);
half operator *= (float f);
half operator /= (half h);
half operator /= (float f);
//---------------------------------------------------------
// Round to n-bit precision (n should be between 0 and 10).
// After rounding, the significand's 10-n least significant
// bits will be zero. For n >= 10 the value is returned
// unchanged.
//---------------------------------------------------------
half round (unsigned int n) const;
//--------------------------------------------------------------------
// Classification:
//
// h.isFinite() returns true if h is a normalized number,
// a denormalized number or zero
//
// h.isNormalized() returns true if h is a normalized number
//
// h.isDenormalized() returns true if h is a denormalized number
//
// h.isZero() returns true if h is zero
//
// h.isNan() returns true if h is a NAN
//
// h.isInfinity() returns true if h is a positive
// or a negative infinity
//
// h.isNegative() returns true if the sign bit of h
// is set (negative)
//--------------------------------------------------------------------
bool isFinite () const;
bool isNormalized () const;
bool isDenormalized () const;
bool isZero () const;
bool isNan () const;
bool isInfinity () const;
bool isNegative () const;
//--------------------------------------------
// Special values
//
// posInf() returns +infinity
//
// negInf() returns -infinity
//
// qNan() returns a NAN with the bit
// pattern 0111111111111111
//
// sNan() returns a NAN with the bit
// pattern 0111110111111111
//--------------------------------------------
static half posInf ();
static half negInf ();
static half qNan ();
static half sNan ();
//--------------------------------------
// Access to the internal representation
//--------------------------------------
unsigned short bits () const;
void setBits (unsigned short bits);
public:
// Overlay used to view the bits of a float as an unsigned int.
union uif
{
unsigned int i;
float f;
};
private:
// General-case float-to-half conversion; i is the float's bit pattern.
static short convert (int i);
// Provokes a floating-point overflow.
static float overflow ();
// Sanity check of the conversions; its result initializes _itWorks.
static bool selftest ();
// The 16-bit pattern of this half.
unsigned short _h;
// Half-to-float table, indexed by the 16-bit half pattern.
static const uif _toFloat[1 << 16];
// Float-to-half exponent table, indexed by the float's 9 sign and
// exponent bits.
static const unsigned short _eLut[1 << 9];
static const bool _itWorks;
};
//-----------
// Stream I/O
//-----------
std::ostream & operator << (std::ostream &os, half h);
std::istream & operator >> (std::istream &is, half &h);
//----------
// Debugging
//----------
void printBits (std::ostream &os, half h);
void printBits (std::ostream &os, float f);
void printBits (char c[19], half h);
void printBits (char c[35], float f);
//-------
// Limits
//-------
#define HALF_MIN 5.96046448e-08 // Smallest positive half
#define HALF_NRM_MIN 6.10351562e-05 // Smallest positive normalized half
#define HALF_MAX 65504.0 // Largest positive half
#define HALF_EPSILON 0.00097656 // Smallest positive e for which
// half (1.0 + e) != half (1.0)
#define HALF_MANT_DIG 11 // Number of digits in mantissa
// (significand + hidden leading 1)
#define HALF_DIG 2 // Number of base 10 digits that
// can be represented without change
#define HALF_RADIX 2 // Base of the exponent
#define HALF_MIN_EXP -13 // Minimum negative integer such that
// HALF_RADIX raised to the power of
// one less than that integer is a
// normalized half
#define HALF_MAX_EXP 16 // Maximum positive integer such that
// HALF_RADIX raised to the power of
// one less than that integer is a
// normalized half
#define HALF_MIN_10_EXP -4 // Minimum negative integer such
// that 10 raised to that power is
// a normalized half
#define HALF_MAX_10_EXP 4 // Maximum positive integer such
// that 10 raised to that power is
// a normalized half
//---------------------------------------------------------------------------
//
// Implementation --
//
// Representation of a float:
//
// We assume that a float, f, is an IEEE 754 single-precision
// floating point number, whose bits are arranged as follows:
//
// 31 (msb)
// |
// | 30 23
// | | |
// | | | 22 0 (lsb)
// | | | | |
// X XXXXXXXX XXXXXXXXXXXXXXXXXXXXXXX
//
// s e m
//
// S is the sign-bit, e is the exponent and m is the significand.
//
// If e is between 1 and 254, f is a normalized number:
//
// s e-127
// f = (-1) * 2 * 1.m
//
// If e is 0, and m is not zero, f is a denormalized number:
//
// s -126
// f = (-1) * 2 * 0.m
//
// If e and m are both zero, f is zero:
//
// f = 0.0
//
// If e is 255, f is an "infinity" or "not a number" (NAN),
// depending on whether m is zero or not.
//
// Examples:
//
// 0 00000000 00000000000000000000000 = 0.0
// 0 01111110 00000000000000000000000 = 0.5
// 0 01111111 00000000000000000000000 = 1.0
// 0 10000000 00000000000000000000000 = 2.0
// 0 10000000 10000000000000000000000 = 3.0
// 1 10000101 11110000010000000000000 = -124.0625
// 0 11111111 00000000000000000000000 = +infinity
// 1 11111111 00000000000000000000000 = -infinity
// 0 11111111 10000000000000000000000 = NAN
// 1 11111111 11111111111111111111111 = NAN
//
// Representation of a half:
//
// Here is the bit-layout for a half number, h:
//
// 15 (msb)
// |
// | 14 10
// | | |
// | | | 9 0 (lsb)
// | | | | |
// X XXXXX XXXXXXXXXX
//
// s e m
//
// S is the sign-bit, e is the exponent and m is the significand.
//
// If e is between 1 and 30, h is a normalized number:
//
// s e-15
// h = (-1) * 2 * 1.m
//
// If e is 0, and m is not zero, h is a denormalized number:
//
// S -14
// h = (-1) * 2 * 0.m
//
// If e and m are both zero, h is zero:
//
// h = 0.0
//
// If e is 31, h is an "infinity" or "not a number" (NAN),
// depending on whether m is zero or not.
//
// Examples:
//
// 0 00000 0000000000 = 0.0
// 0 01110 0000000000 = 0.5
// 0 01111 0000000000 = 1.0
// 0 10000 0000000000 = 2.0
// 0 10000 1000000000 = 3.0
// 1 10101 1111000001 = -124.0625
// 0 11111 0000000000 = +infinity
// 1 11111 0000000000 = -infinity
// 0 11111 1000000000 = NAN
// 1 11111 1111111111 = NAN
//
// Conversion:
//
// Converting from a float to a half requires some non-trivial bit
// manipulations. In some cases, this makes conversion relatively
// slow, but the most common case is accelerated via table lookups.
//
// Converting back from a half to a float is easier because we don't
// have to do any rounding. In addition, there are only 65536
// different half numbers; we can convert each of those numbers once
// and store the results in a table. Later, all conversions can be
// done using only simple table lookups.
//
//---------------------------------------------------------------------------
//--------------------
// Simple constructors
//--------------------
inline
half::half ()
{
// Intentionally empty: _h is not initialized here, so a
// default-constructed half holds no defined value until
// it is assigned.
}
inline
half::half (const half &h)
    : _h (h._h)     // copy the raw 16-bit pattern
{
}
//----------------------------
// Half-from-float constructor
//----------------------------
inline
half::half (float f)
{
    if (f == 0)
    {
        //
        // Common special case - zero.
        // For speed, the sign of the zero is not preserved.
        //

        _h = 0;
    }
    else
    {
        //
        // Extract the combined sign and exponent (9 bits) of f and map
        // them to the sign and exponent of the half via _eLut.
        //
        // A non-zero table entry means a normalized half is produced;
        // then only the significand has to be rounded to 10 bits and
        // added to the entry.
        //
        // A zero entry marks every other case; those are delegated to
        // the non-inline convert().
        //
        // Note: the `register` storage class used here originally was
        // removed from the language in C++17; a plain local is equivalent.
        //

        uif x;

        x.f = f;

        int e = (x.i >> 23) & 0x000001ff;

        e = _eLut[e];

        if (e)
        {
            //
            // Simple case - round the significand (add half an ulp,
            // then drop 13 bits) and combine it with sign and exponent.
            //

            _h = e + (((x.i & 0x007fffff) + 0x00001000) >> 13);
        }
        else
        {
            //
            // Difficult case - call a function.
            //

            _h = convert (x.i);
        }
    }
}
//------------------------------------------
// Half-to-float conversion via table lookup
//------------------------------------------
inline
half::operator float () const
{
    // Every one of the 65536 half patterns has a precomputed float.
    const uif &entry = _toFloat[_h];
    return entry.f;
}
//-------------------------
// Round to n-bit precision
//-------------------------
inline half
half::round (unsigned int n) const
{
    //
    // Nothing to do when all 10 significand bits are kept.
    //

    if (n >= 10)
        return *this;

    //
    // Split the pattern into the sign bit and the combined
    // exponent/significand bits.
    //

    const unsigned short signBit = _h & 0x8000;
    unsigned short magnitude = _h & 0x7fff;

    //
    // Round to the nearest value whose low (10-n) significand bits are
    // zero. A carry out of the significand bumps the exponent
    // automatically because the two fields are adjacent.
    //

    const unsigned int roundShift = 9 - n;

    magnitude >>= roundShift;
    magnitude += magnitude & 1;
    magnitude <<= roundShift;

    if (magnitude >= 0x7c00)
    {
        //
        // Rounding overflowed the exponent -- truncate the original
        // pattern instead.
        //

        const unsigned int truncShift = 10 - n;

        magnitude = _h;
        magnitude >>= truncShift;
        magnitude <<= truncShift;
    }

    //
    // Reattach the original sign bit.
    //

    half rounded;
    rounded._h = signBit | magnitude;
    return rounded;
}
//-----------------------
// Other inline functions
//-----------------------
inline half
half::operator - () const
{
    // Negation only flips the sign bit.
    half negated;
    negated.setBits (_h ^ 0x8000);
    return negated;
}
inline half
half::operator = (half h)
{
    // Take over the other value's bit pattern.
    setBits (h.bits ());
    return *this;
}
inline half
half::operator = (float f)
{
    // Convert the float and keep the resulting bit pattern.
    const half converted (f);
    _h = converted._h;
    return *this;
}
inline half
half::operator += (half h)
{
    // Add in float, then convert the sum back to half.
    const float sum = static_cast<float> (*this) + static_cast<float> (h);
    _h = half (sum)._h;
    return *this;
}
inline half
half::operator += (float f)
{
    // Add in float, then convert the sum back to half.
    const float sum = static_cast<float> (*this) + f;
    _h = half (sum)._h;
    return *this;
}
inline half
half::operator -= (half h)
{
    // Subtract in float, then convert the difference back to half.
    const float difference = static_cast<float> (*this) - static_cast<float> (h);
    _h = half (difference)._h;
    return *this;
}
inline half
half::operator -= (float f)
{
    // Subtract in float, then convert the difference back to half.
    const float difference = static_cast<float> (*this) - f;
    _h = half (difference)._h;
    return *this;
}
inline half
half::operator *= (half h)
{
    // Multiply in float, then convert the product back to half.
    const float product = static_cast<float> (*this) * static_cast<float> (h);
    _h = half (product)._h;
    return *this;
}
inline half
half::operator *= (float f)
{
    // Multiply in float, then convert the product back to half.
    const float product = static_cast<float> (*this) * f;
    _h = half (product)._h;
    return *this;
}
inline half
half::operator /= (half h)
{
    // Divide in float, then convert the quotient back to half.
    const float quotient = static_cast<float> (*this) / static_cast<float> (h);
    _h = half (quotient)._h;
    return *this;
}
inline half
half::operator /= (float f)
{
    // Divide in float, then convert the quotient back to half.
    const float quotient = static_cast<float> (*this) / f;
    _h = half (quotient)._h;
    return *this;
}
inline bool
half::isFinite () const
{
    // Finite means the 5 exponent bits are not all ones.
    return (_h & 0x7c00) != 0x7c00;
}
inline bool
half::isNormalized () const
{
    // Normalized: exponent bits are neither all zeros nor all ones.
    const int exponentBits = _h & 0x7c00;
    return exponentBits != 0 && exponentBits != 0x7c00;
}
inline bool
half::isDenormalized () const
{
    // Denormalized: zero exponent with a non-zero significand.
    const bool exponentIsZero = (_h & 0x7c00) == 0;
    const bool significandSet = (_h & 0x03ff) != 0;
    return exponentIsZero && significandSet;
}
inline bool
half::isZero () const
{
    // Zero (of either sign): every bit except the sign bit is clear.
    return !(_h & 0x7fff);
}
inline bool
half::isNan () const
{
    // NAN: all exponent bits set and a non-zero significand.
    const bool exponentAllOnes = (_h & 0x7c00) == 0x7c00;
    const bool significandSet = (_h & 0x03ff) != 0;
    return exponentAllOnes && significandSet;
}
inline bool
half::isInfinity () const
{
    // Infinity: all exponent bits set and a zero significand,
    // regardless of the sign bit.
    return (_h & 0x7fff) == 0x7c00;
}
inline bool
half::isNegative () const
{
    // Only the sign bit is inspected.
    return (_h & 0x8000) == 0x8000;
}
inline half
half::posInf ()
{
    // Sign 0, exponent all ones, significand zero.
    half result;
    result.setBits (0x7c00);
    return result;
}
inline half
half::negInf ()
{
    // Sign 1, exponent all ones, significand zero.
    half result;
    result.setBits (0xfc00);
    return result;
}
inline half
half::qNan ()
{
    // Bit pattern 0111111111111111.
    half result;
    result.setBits (0x7fff);
    return result;
}
inline half
half::sNan ()
{
    // Bit pattern 0111110111111111.
    half result;
    result.setBits (0x7dff);
    return result;
}
inline unsigned short
half::bits () const
{
return _h;
}
inline void
half::setBits (unsigned short bits)
{
_h = bits;
}
#endif
die cpp
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
// Digital Ltd. LLC
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Industrial Light & Magic nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
// Primary authors:
// Florian Kainz <kainz@ilm.com>
// Rod Bogart <rgb@ilm.com>
//---------------------------------------------------------------------------
//
// class half --
// implementation of non-inline members
//
//---------------------------------------------------------------------------
#include <assert.h>
#include <half.h>
using namespace std;
//-------------------------------------------------------------
// Lookup tables for half-to-float and float-to-half conversion
//-------------------------------------------------------------
const half::uif half::_toFloat[1 << 16] =
#include <toFloat.h>
const unsigned short half::_eLut[1 << 9] =
#include <eLut.h>
//--------------------------------------
// Dummy flag, initialized by selftest()
//--------------------------------------
const bool half::_itWorks =
selftest();
//-----------------------------------------------
// Overflow handler for float-to-half conversion;
// generates a hardware floating-point overflow,
// which may be trapped by the operating system.
//-----------------------------------------------
float
half::overflow ()
{
    // Square 1e10 repeatedly; the value exceeds the float range
    // well before the last iteration. volatile keeps the
    // multiplications from being optimized away.
    volatile float f = 1e10;

    int remaining = 10;

    while (remaining-- > 0)
        f *= f;

    return f;
}
//-----------------------------------------------------
// Float-to-half conversion -- general case, including
// zeroes, denormalized numbers and exponent overflows.
//-----------------------------------------------------
short
half::convert (int i)
{
    //
    // General-case float-to-half conversion. i holds the bit pattern of
    // the float. Returns the bit pattern of the resulting half.
    //
    // Split i into the sign s (already shifted to bit 15, where it goes
    // in the half), the exponent e and the significand m. e is
    // re-biased from 127 (float) to 15 (half).
    //
    // Note: the `register` storage class used here originally was
    // removed from the language in C++17; plain locals are equivalent.
    //

    int s =  (i >> 16) & 0x00008000;
    int e = ((i >> 23) & 0x000000ff) - (127 - 15);
    int m =   i        & 0x007fffff;

    //
    // Now reassemble s, e and m into a half:
    //

    if (e <= 0)
    {
        if (e < -10)
        {
            //
            // The absolute value of f is less than HALF_MIN (f may be
            // a small normalized float, a denormalized float or a
            // zero). Convert to a half zero (the sign is dropped).
            //

            return 0;
        }

        //
        // e is between -10 and 0: f is a normalized float whose
        // magnitude is less than HALF_NRM_MIN. Produce a denormalized
        // half: make the hidden bit explicit and shift right.
        //

        m = (m | 0x00800000) >> (1 - e);

        //
        // Round to nearest, round "0.5" up.
        //
        // Rounding may carry into bit 10 and make the number
        // normalized; because exponent and significand are adjacent
        // in the half, no special handling is needed.
        //

        if (m & 0x00001000)
            m += 0x00002000;

        //
        // Assemble the half from s, e (zero) and m.
        //

        return s | (m >> 13);
    }
    else if (e == 0xff - (127 - 15))
    {
        if (m == 0)
        {
            //
            // f is an infinity; produce a half infinity with the
            // same sign.
            //

            return s | 0x7c00;
        }
        else
        {
            //
            // f is a NAN; keep the sign bit and the 10 leftmost
            // significand bits. If those 10 bits are all zero the
            // result would read as an infinity, so the lowest
            // significand bit is set to keep it a NAN.
            //

            m >>= 13;
            return s | 0x7c00 | m | (m == 0);
        }
    }
    else
    {
        //
        // e is greater than zero: f is a normalized float; try to
        // produce a normalized half.
        //

        //
        // Round to nearest, round "0.5" up.
        //

        if (m & 0x00001000)
        {
            m += 0x00002000;

            if (m & 0x00800000)
            {
                m =  0;     // overflow in significand,
                e += 1;     // adjust exponent
            }
        }

        //
        // Handle exponent overflow.
        //

        if (e > 30)
        {
            overflow ();        // Cause a hardware floating point overflow;
            return s | 0x7c00;  // if this returns, the half becomes an
        }                       // infinity with the same sign as f.

        //
        // Assemble the half from s, e and m.
        //

        return s | (e << 10) | (m >> 13);
    }
}
//--------------------------------------
// Simple selftest, triggered by static
// initialization of half::_itWorks flag
//--------------------------------------
namespace
{
void
testNormalized (float f)
{
half h (f);
float e (1 - h / f);
if (e < 0)
e = -e;
if (e > HALF_EPSILON * 0.5)
{
cerr << "Internal error: float/half conversion does not work.";
assert (false);
}
}
void
testDenormalized (float f)
{
half h (f);
float e (h - f);
if (e < 0)
e = -e;
if (e > HALF_MIN * 0.5)
{
cerr << "Internal error: float/half conversion does not work.";
assert (false);
}
}
}
bool
half::selftest ()
{
    // Each magnitude is checked with a positive and then a negative sign,
    // in the order listed.
    const float normalizedMagnitudes[] =
    {
        (float) HALF_MAX, 0.1f, 0.5f, 1.0f, 2.0f, 3.0f, 17.0f,
        (float) HALF_NRM_MIN
    };

    const float denormalizedMagnitudes[] =
    {
        (float) HALF_MIN, 0.0f
    };

    const unsigned int normalizedCount =
        sizeof (normalizedMagnitudes) / sizeof (normalizedMagnitudes[0]);

    const unsigned int denormalizedCount =
        sizeof (denormalizedMagnitudes) / sizeof (denormalizedMagnitudes[0]);

    for (unsigned int k = 0; k < normalizedCount; ++k)
    {
        testNormalized ( normalizedMagnitudes[k]);
        testNormalized (-normalizedMagnitudes[k]);
    }

    for (unsigned int k = 0; k < denormalizedCount; ++k)
    {
        testDenormalized ( denormalizedMagnitudes[k]);
        testDenormalized (-denormalizedMagnitudes[k]);
    }

    return true;
}
//---------------------
// Stream I/O operators
//---------------------
ostream &
operator << (ostream &os, half h)
{
    // A half is written exactly like the float it converts to.
    return os << float (h);
}
istream &
operator >> (istream &is, half &h)
{
    // Read a float and convert whatever was extracted to half.
    float parsed;
    is >> parsed;
    h = parsed;
    return is;
}
//---------------------------------------
// Functions to print the bit-layout of
// floats and halfs, mostly for debugging
//---------------------------------------
void
printBits (ostream &os, half h)
{
    // Walk a one-bit mask from the sign bit down to the lsb, inserting
    // a blank after the sign bit and after the last exponent bit.
    const unsigned short pattern = h.bits();

    for (unsigned int mask = 0x8000; mask != 0; mask >>= 1)
    {
        os << ((pattern & mask)? '1': '0');

        if (mask == 0x8000 || mask == 0x0400)
            os << ' ';
    }
}
void
printBits (ostream &os, float f)
{
    // View the float's bits as an unsigned int, then walk a one-bit
    // mask from bit 31 down to bit 0, inserting a blank after the
    // sign bit and after the last exponent bit.
    half::uif view;
    view.f = f;

    for (unsigned int mask = 0x80000000u; mask != 0; mask >>= 1)
    {
        os << ((view.i & mask)? '1': '0');

        if (mask == 0x80000000u || mask == 0x00800000u)
            os << ' ';
    }
}
void
printBits (char c[19], half h)
{
    // Writes 16 digits plus two separating blanks into c and
    // terminates the string at c[18].
    const unsigned short pattern = h.bits();
    char *out = c;

    for (int bit = 15; bit >= 0; --bit)
    {
        *out++ = ((pattern >> bit) & 1)? '1': '0';

        if (bit == 15 || bit == 10)
            *out++ = ' ';
    }

    c[18] = 0;
}
void
printBits (char c[35], float f)
{
half::uif x;
x.f = f;
for (int i = 31, j = 0; i >= 0; i--, j++)
{
c[j] = (((x.i >> i) & 1)? '1': '0');
if (i == 31 || i == 23)
c[++j] = ' ';
}
c[34] = 0;
}
half to float
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
// Digital Ltd. LLC
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Industrial Light & Magic nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
//---------------------------------------------------------------------------
//
// toFloat
//
// A program to generate the lookup table for half-to-float
// conversion needed by class half.
// The program loops over all 65536 possible half numbers,
// converts each of them to a float, and prints the result.
//
//---------------------------------------------------------------------------
#include <iostream>
#include <iomanip>
using namespace std;
//---------------------------------------------------
// Interpret an unsigned short bit pattern as a half,
// and convert that half to the corresponding float's
// bit pattern.
//---------------------------------------------------
unsigned int
halfToFloat (unsigned short y)
{
    //
    // Interprets y as the bit pattern of a half and returns the bit
    // pattern of the float with the same value.
    //
    // The result is assembled in unsigned arithmetic so that the sign
    // bit is never shifted into the sign bit of a signed int.
    //

    const unsigned int sign = static_cast<unsigned int> ((y >> 15) & 0x1) << 31;
    int e = (y >> 10) & 0x0000001f;
    unsigned int m = y & 0x000003ff;

    if (e == 0)
    {
        if (m == 0)
        {
            //
            // Plus or minus zero
            //

            return sign;
        }

        //
        // Denormalized number -- renormalize it: shift the significand
        // up until the hidden bit appears, lowering the exponent by one
        // per shift, then drop the hidden bit.
        //

        while (!(m & 0x00000400))
        {
            m <<= 1;
            e -= 1;
        }

        e += 1;
        m &= ~0x00000400u;
    }
    else if (e == 31)
    {
        //
        // Infinity (m == 0) or NAN (m != 0) -- preserve the sign and,
        // for a NAN, the significand bits.
        //

        return sign | 0x7f800000u | (m << 13);
    }

    //
    // Normalized number: re-bias the exponent (15 -> 127) and widen the
    // significand from 10 to 23 bits. e is positive here even for
    // renormalized denormals (at least -9 + 112).
    //

    e += 127 - 15;

    return sign | (static_cast<unsigned int> (e) << 23) | (m << 13);
}
//---------------------------------------------
// Main - prints the half-to-float lookup table
//---------------------------------------------
int
main ()
{
    // Emit the table as hexadecimal integers.
    cout.precision (9);

#ifndef HAVE_IOS_BASE
    cout.setf (ios::hex, ios::basefield);
#else
    cout.setf (ios_base::hex, ios_base::basefield);
#endif

    cout << "//\n"
            "// This is an automatically generated file.\n"
            "// Do not edit.\n"
            "//\n\n";

    cout << "{\n ";

    const int entryCount = (1 << 16);
    const int lastEntry = entryCount - 1;

    // One initializer per half bit pattern, four per output row.
    for (int pattern = 0; pattern != entryCount; ++pattern)
    {
        cout << "{0x" << setfill ('0') << setw (8) << halfToFloat (pattern) << "}, ";

        const bool endOfRow = (pattern % 4 == 3);

        if (!endOfRow)
            continue;

        cout << "\n";

        if (pattern < lastEntry)
            cout << " ";
    }

    cout << "};\n";
    return 0;
}
vBulletin®, Copyright ©2000-2025, Jelsoft Enterprises Ltd.