float16/arithmetic.go at main · zerfoo/float16 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
package float16

import (
	"math"
)

// Global arithmetic settings.
//
// DefaultArithmeticMode and DefaultRounding are the mode and rounding values
// that Add, Sub, Mul and Div pass to their *WithMode counterparts. They are
// ordinary package-level variables, so reassigning them changes the behavior
// of every subsequent call to those four functions (and of the slice helpers
// built on them).
var (
	DefaultArithmeticMode = ModeIEEEArithmetic
	DefaultRounding       = DefaultRoundingMode
)

// ArithmeticMode defines the precision/performance trade-off for arithmetic operations.
// It is the mode argument of AddWithMode, SubWithMode, MulWithMode and DivWithMode.
type ArithmeticMode int

const (
	// ModeIEEEArithmetic provides full IEEE 754 compliance with proper rounding.
	// In this file, finite non-zero operands are handed to the *IEEE754 helpers,
	// which honor the caller-supplied RoundingMode.
	ModeIEEEArithmetic ArithmeticMode = iota
	// ModeFastArithmetic optimizes for speed, may sacrifice some precision.
	// In this file, finite non-zero operands are computed in float32 and
	// converted back with FromFloat32; the RoundingMode argument is not used.
	ModeFastArithmetic
	// ModeExactArithmetic provides exact results when possible. In this file it
	// makes the *WithMode functions return a *Float16Error (instead of a NaN or
	// an infinity) for NaN operands, invalid operations and division by zero.
	// NOTE(review): the original comment also promised an error on precision
	// loss; no such check is visible in this file — confirm where it is enforced.
	ModeExactArithmetic
)

// Add returns a + b, evaluated with the package-level DefaultArithmeticMode
// and DefaultRounding.
//
// The error reported by AddWithMode is deliberately discarded; whatever
// Float16 value accompanies that error is what the caller receives.
func Add(a, b Float16) Float16 {
	sum, _ := AddWithMode(a, b, DefaultArithmeticMode, DefaultRounding) // error intentionally ignored
	return sum
}

// AddWithMode returns a + b using the given arithmetic and rounding modes.
//
// Special operands are resolved before any arithmetic is performed:
//   - NaN operand: QuietNaN, or an ErrNaN error in ModeExactArithmetic.
//   - both operands zero: a zero that is negative only when both are −0.
//   - exactly one operand zero: the other operand, unchanged.
//   - (+∞) + (−∞) in either order: QuietNaN, or an ErrInvalidOperation error
//     in ModeExactArithmetic.
//   - any other infinity: that infinity.
//
// Remaining (finite, non-zero) operands are added in float32 and converted
// with FromFloat32 in ModeFastArithmetic, or passed to addIEEE754 together
// with rounding in every other mode.
//
// On error the returned Float16 is 0 and must not be used.
func AddWithMode(a, b Float16, mode ArithmeticMode, rounding RoundingMode) (Float16, error) {
	// NaN has to be tested before the zero shortcuts: otherwise 0 + NaN would
	// hand back the raw NaN operand and skip the exact-mode error.
	if a.IsNaN() || b.IsNaN() {
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "add",
				Msg:  "NaN operand in exact mode",
				Code: ErrNaN,
			}
		}
		// Return a quiet NaN
		return QuietNaN, nil
	}

	// Zero operands need no arithmetic.
	aZero, bZero := a.IsZero(), b.IsZero()
	switch {
	case aZero && bZero:
		// (−0) + (−0) = −0; every other combination of zeros sums to +0.
		// If a is −0 the answer equals b (−0 or +0); otherwise a is +0 and
		// is itself the answer.
		// NOTE(review): IEEE 754 yields −0 for (+0) + (−0) under
		// round-toward-negative; that mode is not special-cased here.
		if a.Signbit() {
			return b, nil
		}
		return a, nil
	case aZero:
		return b, nil
	case bZero:
		return a, nil
	}

	// Infinities: opposite signs are invalid, anything else stays infinite.
	aInf, bInf := a.IsInf(0), b.IsInf(0)
	if aInf && bInf && a.Signbit() != b.Signbit() {
		// +∞ + (−∞) and (−∞) + (+∞) are both undefined
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "add",
				Msg:  "infinity - infinity is undefined",
				Code: ErrInvalidOperation,
			}
		}
		return QuietNaN, nil
	}
	if aInf {
		return a, nil
	}
	if bInf {
		return b, nil
	}

	// Fast path: compute in float32 and convert back with the default conversion.
	if mode == ModeFastArithmetic {
		return FromFloat32(a.ToFloat32() + b.ToFloat32()), nil
	}

	// IEEE and exact modes honor the requested rounding mode.
	return addIEEE754(a, b, rounding)
}

// Sub returns a - b, evaluated with the package-level DefaultArithmeticMode
// and DefaultRounding.
//
// The error reported by SubWithMode is deliberately discarded; whatever
// Float16 value accompanies that error is what the caller receives.
func Sub(a, b Float16) Float16 {
	diff, _ := SubWithMode(a, b, DefaultArithmeticMode, DefaultRounding) // error intentionally ignored
	return diff
}

// SubWithMode returns a - b using the given arithmetic and rounding modes.
//
// It is implemented as a + (−b): b is negated with Neg and the work is
// delegated to AddWithMode, so both the result and any error come straight
// from AddWithMode.
func SubWithMode(a, b Float16, mode ArithmeticMode, rounding RoundingMode) (Float16, error) {
	negated := b.Neg()
	return AddWithMode(a, negated, mode, rounding)
}

// Mul returns a * b, evaluated with the package-level DefaultArithmeticMode
// and DefaultRounding.
//
// The error reported by MulWithMode is deliberately discarded; whatever
// Float16 value accompanies that error is what the caller receives.
func Mul(a, b Float16) Float16 {
	product, _ := MulWithMode(a, b, DefaultArithmeticMode, DefaultRounding) // error intentionally ignored
	return product
}

// MulWithMode returns a * b using the given arithmetic and rounding modes.
//
// Special operands are resolved before any arithmetic is performed:
//   - NaN operand: QuietNaN, or an ErrNaN error in ModeExactArithmetic.
//   - 0 × ∞ in either order: QuietNaN, or an ErrInvalidOperation error in
//     ModeExactArithmetic.
//   - any other zero operand: a zero whose sign is the XOR of the operand signs.
//   - any other infinite operand: an infinity whose sign is the XOR of the
//     operand signs.
//
// Remaining (finite, non-zero) operands are multiplied in float32 and
// converted with FromFloat32 in ModeFastArithmetic, or passed to mulIEEE754
// together with rounding in every other mode.
//
// On error the returned Float16 is 0 and must not be used.
func MulWithMode(a, b Float16, mode ArithmeticMode, rounding RoundingMode) (Float16, error) {
	// NaN has to be tested before the zero shortcut: otherwise 0 × NaN would
	// be reported as a signed zero instead of NaN.
	if a.IsNaN() || b.IsNaN() {
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "mul",
				Msg:  "NaN operand in exact mode",
				Code: ErrNaN,
			}
		}
		return QuietNaN, nil
	}

	aZero, bZero := a.IsZero(), b.IsZero()
	aInf, bInf := a.IsInf(0), b.IsInf(0)

	// 0 * ∞ = NaN
	if (aZero && bInf) || (aInf && bZero) {
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "mul",
				Msg:  "zero times infinity is undefined",
				Code: ErrInvalidOperation,
			}
		}
		return QuietNaN, nil
	}

	// The sign of every remaining special result is the XOR of the operand signs.
	negative := a.Signbit() != b.Signbit()

	// 0 * finite = ±0
	if aZero || bZero {
		if negative {
			return NegativeZero, nil
		}
		return PositiveZero, nil
	}

	// ∞ * non-zero = ±∞ (the 0 × ∞ case was rejected above)
	if aInf || bInf {
		if negative {
			return NegativeInfinity, nil
		}
		return PositiveInfinity, nil
	}

	// Fast path: compute in float32 and convert back with the default conversion.
	if mode == ModeFastArithmetic {
		return FromFloat32(a.ToFloat32() * b.ToFloat32()), nil
	}

	// IEEE and exact modes honor the requested rounding mode.
	return mulIEEE754(a, b, rounding)
}

// Div returns a / b, evaluated with the package-level DefaultArithmeticMode
// and DefaultRounding.
//
// The error reported by DivWithMode is deliberately discarded; whatever
// Float16 value accompanies that error is what the caller receives.
func Div(a, b Float16) Float16 {
	quotient, _ := DivWithMode(a, b, DefaultArithmeticMode, DefaultRounding) // error intentionally ignored
	return quotient
}

// DivWithMode returns a / b using the given arithmetic and rounding modes.
//
// Special operands are resolved before any arithmetic is performed:
//   - NaN operand: QuietNaN, or an ErrNaN error in ModeExactArithmetic.
//   - 0 / 0: QuietNaN, or an ErrInvalidOperation error in ModeExactArithmetic.
//   - non-zero / 0 (the dividend may be infinite): ±∞, or an
//     ErrDivisionByZero error in ModeExactArithmetic.
//   - 0 / non-zero (the divisor may be infinite): ±0.
//   - ∞ / ∞: QuietNaN, or an ErrInvalidOperation error in ModeExactArithmetic.
//   - ∞ / finite: ±∞.
//   - finite / ∞: ±0.
//
// The sign of every signed special result is the XOR of the operand signs.
//
// Remaining (finite, non-zero) operands are divided in float32 and converted
// with FromFloat32 in ModeFastArithmetic, or passed to divIEEE754 together
// with rounding in every other mode.
//
// On error the returned Float16 is 0 and must not be used.
func DivWithMode(a, b Float16, mode ArithmeticMode, rounding RoundingMode) (Float16, error) {
	// NaN has to be tested before the zero and infinity shortcuts: otherwise
	// NaN/0 and ∞/NaN would be reported as ±∞, and 0/NaN and NaN/∞ as ±0.
	if a.IsNaN() || b.IsNaN() {
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "div",
				Msg:  "NaN operand in exact mode",
				Code: ErrNaN,
			}
		}
		return QuietNaN, nil
	}

	negative := a.Signbit() != b.Signbit()
	aZero, bZero := a.IsZero(), b.IsZero()

	// Handle division by zero
	if bZero {
		if aZero {
			// 0/0 = NaN
			if mode == ModeExactArithmetic {
				return 0, &Float16Error{
					Op:   "div",
					Msg:  "zero divided by zero is undefined",
					Code: ErrInvalidOperation,
				}
			}
			return QuietNaN, nil
		}
		// non-zero/0 = ±∞
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "div",
				Msg:  "division by zero",
				Code: ErrDivisionByZero,
			}
		}
		if negative {
			return NegativeInfinity, nil
		}
		return PositiveInfinity, nil
	}

	// 0/non-zero = ±0 (this also covers 0/∞)
	if aZero {
		if negative {
			return NegativeZero, nil
		}
		return PositiveZero, nil
	}

	// Handle infinity cases
	aInf, bInf := a.IsInf(0), b.IsInf(0)
	if aInf && bInf {
		// ∞/∞ = NaN
		if mode == ModeExactArithmetic {
			return 0, &Float16Error{
				Op:   "div",
				Msg:  "infinity divided by infinity is undefined",
				Code: ErrInvalidOperation,
			}
		}
		return QuietNaN, nil
	}
	if aInf {
		// ∞/finite = ±∞
		if negative {
			return NegativeInfinity, nil
		}
		return PositiveInfinity, nil
	}
	if bInf {
		// finite/∞ = ±0
		if negative {
			return NegativeZero, nil
		}
		return PositiveZero, nil
	}

	// Fast path: compute in float32 and convert back with the default conversion.
	if mode == ModeFastArithmetic {
		return FromFloat32(a.ToFloat32() / b.ToFloat32()), nil
	}

	// IEEE and exact modes honor the requested rounding mode.
	return divIEEE754(a, b, rounding)
}

// IEEE 754 compliant arithmetic implementations

// addIEEE754 adds two Float16 values so that the only rounding that decides
// the result is the final one performed by FromFloat32WithRounding with the
// requested rounding mode. The returned error is always nil.
//
// A float32 sum is NOT always exact for float16 operands (for example
// 1 + 2^-24 needs 25 significand bits), and rounding it to nearest before
// applying a directed rounding mode can give the wrong float16 result.
// Instead the sum is formed in float64, where it is exact: finite float16
// values span 2^-24 .. 2^15, so any sum fits in 41 bits (assuming ToFloat64
// is an exact widening conversion). That exact sum is then narrowed to
// float32 with "round to odd": when the narrowing is inexact, the enclosing
// float32 value with an odd last significand bit is chosen. Because float32
// carries more than two extra bits over float16, the final rounding of that
// value matches rounding the exact sum directly.
func addIEEE754(a, b Float16, rounding RoundingMode) (Float16, error) {
	exact := a.ToFloat64() + b.ToFloat64()
	narrowed := float32(exact)

	// exact == exact filters out NaN, which would otherwise look "inexact".
	if widened := float64(narrowed); exact == exact && widened != exact {
		if math.Float32bits(narrowed)&1 == 0 {
			// The nearest float32 is even; its neighbour on the far side of
			// the exact value is adjacent and therefore odd.
			if widened < exact {
				narrowed = math.Nextafter32(narrowed, float32(math.Inf(1)))
			} else {
				narrowed = math.Nextafter32(narrowed, float32(math.Inf(-1)))
			}
		}
	}

	return FromFloat32WithRounding(narrowed, rounding), nil
}

// mulIEEE754 multiplies two Float16 values for the IEEE arithmetic path.
//
// Both operands are widened with ToFloat32, multiplied as float32, and the
// product is converted back with FromFloat32WithRounding using the requested
// rounding mode. The returned error is always nil.
func mulIEEE754(a, b Float16, rounding RoundingMode) (Float16, error) {
	product := a.ToFloat32() * b.ToFloat32()
	return FromFloat32WithRounding(product, rounding), nil
}

// divIEEE754 divides two Float16 values for the IEEE arithmetic path.
//
// Both operands are widened with ToFloat32, divided as float32, and the
// quotient is converted back with FromFloat32WithRounding using the requested
// rounding mode. The returned error is always nil.
//
// NOTE(review): the quotient is rounded once to float32 and again to
// float16; this double rounding is presumably harmless for float16 operands,
// but confirm for every supported RoundingMode.
func divIEEE754(a, b Float16, rounding RoundingMode) (Float16, error) {
	quotient := a.ToFloat32() / b.ToFloat32()
	return FromFloat32WithRounding(quotient, rounding), nil
}

// Comparison operations

// Equal reports whether a and b are numerically equal.
//
// A NaN operand never compares equal (not even to itself), and +0 compares
// equal to −0. Every other pair is compared with ==.
func Equal(a, b Float16) bool {
	switch {
	case a.IsNaN() || b.IsNaN():
		// NaN != NaN
		return false
	case a.IsZero() && b.IsZero():
		// +0 == -0
		return true
	default:
		return a == b
	}
}

// Less reports whether a < b.
//
// Any comparison involving NaN is false, and −0 is not less than +0.
// Operands of different sign are ordered by sign alone; operands of the same
// sign are ordered by comparing their raw values (presumably the underlying
// bit patterns), with the comparison reversed for negatives.
func Less(a, b Float16) bool {
	// NaN is unordered, and the two zeros are equal to each other.
	if a.IsNaN() || b.IsNaN() || (a.IsZero() && b.IsZero()) {
		return false
	}

	negA, negB := a.Signbit(), b.Signbit()
	switch {
	case negA != negB:
		// Opposite signs: a is smaller exactly when it is the negative one.
		return negA
	case negA:
		// Both negative: the larger raw value is the smaller number.
		return a > b
	default:
		// Both positive: the smaller raw value is the smaller number.
		return a < b
	}
}

// Greater reports whether a > b. It is Less with the operands swapped, so it
// is false whenever Less(b, a) is false.
func Greater(a, b Float16) bool {
	swappedLess := Less(b, a)
	return swappedLess
}

// LessEqual reports whether a <= b, i.e. whether Less(a, b) or Equal(a, b)
// holds.
func LessEqual(a, b Float16) bool {
	if Less(a, b) {
		return true
	}
	return Equal(a, b)
}

// GreaterEqual reports whether a >= b, i.e. whether Greater(a, b) or
// Equal(a, b) holds.
func GreaterEqual(a, b Float16) bool {
	if Greater(a, b) {
		return true
	}
	return Equal(a, b)
}

// Min returns the smaller of a and b.
//
// If exactly one operand is NaN the other operand is returned; if both are
// NaN the result is NaN. When both operands are zero, −0 is preferred over
// +0. Otherwise the ordering is decided by Less, and b is returned when a is
// not less than b.
func Min(a, b Float16) Float16 {
	switch {
	case a.IsNaN():
		return b
	case b.IsNaN():
		return a
	case a.IsZero() && b.IsZero():
		// Less treats the two zeros as equal, so choose by sign here.
		if a.Signbit() {
			return a // a is -0
		}
		return b // b is -0, or both are +0
	case Less(a, b):
		return a
	default:
		return b
	}
}

// Max returns the larger of a and b.
//
// If exactly one operand is NaN the other operand is returned; if both are
// NaN the result is NaN. When both operands are zero, +0 is preferred over
// −0 (mirroring Min, which prefers −0). Otherwise the ordering is decided by
// Greater, and b is returned when a is not greater than b.
func Max(a, b Float16) Float16 {
	// Handle NaN: return the non-NaN value, or NaN if both are NaN
	if a.IsNaN() {
		return b
	}
	if b.IsNaN() {
		return a
	}

	// Greater treats −0 and +0 as equal, so without this check Max(+0, −0)
	// would fall through and return −0.
	if a.IsZero() && b.IsZero() {
		if a.Signbit() {
			return b // a is -0: b is +0, or both are -0
		}
		return a // a is +0
	}

	if Greater(a, b) {
		return a
	}
	return b
}

// Batch operations for high-performance computing

// AddSlice returns a newly allocated slice whose i-th element is
// Add(a[i], b[i]). The inputs are not modified.
//
// It panics if a and b have different lengths.
func AddSlice(a, b []Float16) []Float16 {
	n := len(a)
	if n != len(b) {
		panic("float16: slice length mismatch")
	}

	out := make([]Float16, n)
	for i, x := range a {
		out[i] = Add(x, b[i])
	}
	return out
}

// SubSlice returns a newly allocated slice whose i-th element is
// Sub(a[i], b[i]). The inputs are not modified.
//
// It panics if a and b have different lengths.
func SubSlice(a, b []Float16) []Float16 {
	n := len(a)
	if n != len(b) {
		panic("float16: slice length mismatch")
	}

	out := make([]Float16, n)
	for i, x := range a {
		out[i] = Sub(x, b[i])
	}
	return out
}

// MulSlice returns a newly allocated slice whose i-th element is
// Mul(a[i], b[i]). The inputs are not modified.
//
// It panics if a and b have different lengths.
func MulSlice(a, b []Float16) []Float16 {
	n := len(a)
	if n != len(b) {
		panic("float16: slice length mismatch")
	}

	out := make([]Float16, n)
	for i, x := range a {
		out[i] = Mul(x, b[i])
	}
	return out
}

// DivSlice returns a newly allocated slice whose i-th element is
// Div(a[i], b[i]). The inputs are not modified.
//
// It panics if a and b have different lengths.
func DivSlice(a, b []Float16) []Float16 {
	n := len(a)
	if n != len(b) {
		panic("float16: slice length mismatch")
	}

	out := make([]Float16, n)
	for i, x := range a {
		out[i] = Div(x, b[i])
	}
	return out
}

// ScaleSlice returns a newly allocated slice whose i-th element is
// Mul(s[i], scalar). The input slice is not modified.
func ScaleSlice(s []Float16, scalar Float16) []Float16 {
	scaled := make([]Float16, len(s))
	for i, v := range s {
		scaled[i] = Mul(v, scalar)
	}
	return scaled
}

// SumSlice returns the sum of all elements of s, accumulated left to right
// with Add starting from PositiveZero. An empty slice therefore yields
// PositiveZero.
func SumSlice(s []Float16) Float16 {
	total := PositiveZero
	for i := range s {
		total = Add(total, s[i])
	}
	return total
}

// DotProduct returns the dot product of a and b. Each pairwise product is
// formed with Mul and accumulated left to right with Add, starting from
// PositiveZero, so every intermediate value is a Float16.
//
// It panics if a and b have different lengths.
func DotProduct(a, b []Float16) Float16 {
	if len(a) != len(b) {
		panic("float16: slice length mismatch")
	}

	acc := PositiveZero
	for i, x := range a {
		acc = Add(acc, Mul(x, b[i]))
	}
	return acc
}

// Norm2 computes the L2 norm (Euclidean norm) of a Float16 slice and returns
// it converted with FromFloat64. An empty slice yields the conversion of 0.
//
// The squares are accumulated in float64 rather than in Float16. Squaring in
// Float16 overflows to +Inf for any element with magnitude of 256 or more
// (256*256 exceeds the largest finite float16, 65504), which made the norm
// infinite even when the true norm was representable, and every intermediate
// sum was rounded to float16 precision. As a consequence this function no
// longer depends on DefaultArithmeticMode or DefaultRounding; NaN and
// infinite elements propagate through the float64 arithmetic.
func Norm2(s []Float16) Float16 {
	var sumSquares float64
	for _, v := range s {
		x := v.ToFloat64()
		sumSquares += x * x
	}
	return FromFloat64(math.Sqrt(sumSquares))
}