-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvec__int128__ppc_8h.html
6154 lines (5863 loc) · 483 KB
/
vec__int128__ppc_8h.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>POWER Vector Library Manual: src/pveclib/vec_int128_ppc.h File Reference</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectalign" style="padding-left: 0.5em;">
<div id="projectname">POWER Vector Library Manual
 <span id="projectnumber">1.0.4</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
<script type="text/javascript" src="menudata.js"></script>
<script type="text/javascript" src="menu.js"></script>
<script type="text/javascript">
$(function() {
initMenu('',true,false,'search.php','Search');
$(document).ready(function() { init_search(); });
});
</script>
<div id="main-nav"></div>
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div id="nav-path" class="navpath">
<ul>
<li class="navelem"><a class="el" href="dir_68267d1309a1af8e8297ef4c3efbcdba.html">src</a></li><li class="navelem"><a class="el" href="dir_3653a864936a87c29f489ec2a5b8be1c.html">pveclib</a></li> </ul>
</div>
</div><!-- top -->
<div class="header">
<div class="summary">
<a href="#define-members">Macros</a> |
<a href="#func-members">Functions</a> </div>
<div class="headertitle">
<div class="title">vec_int128_ppc.h File Reference</div> </div>
</div><!--header-->
<div class="contents">
<p>Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions.
<a href="#details">More...</a></p>
<div class="textblock"><code>#include &lt;<a class="el" href="vec__common__ppc_8h_source.html">pveclib/vec_common_ppc.h</a>&gt;</code><br />
<code>#include &lt;<a class="el" href="vec__int64__ppc_8h_source.html">pveclib/vec_int64_ppc.h</a>&gt;</code><br />
</div>
<p><a href="vec__int128__ppc_8h_source.html">Go to the source code of this file.</a></p>
<table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="define-members"></a>
Macros</h2></td></tr>
<tr class="memitem:a0f75e65180e68c4753f3d9c2f42d1a31"><td class="memItemLeft" align="right" valign="top">#define </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a>(__q0, __q1, __q2, __q3)</td></tr>
<tr class="memdesc:a0f75e65180e68c4753f3d9c2f42d1a31"><td class="mdescLeft"> </td><td class="mdescRight">Generate a vector unsigned __int128 constant from words. <a href="#a0f75e65180e68c4753f3d9c2f42d1a31">More...</a><br /></td></tr>
<tr class="separator:a0f75e65180e68c4753f3d9c2f42d1a31"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a12118674c4e47eb7c939bb29a379d381"><td class="memItemLeft" align="right" valign="top">#define </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a12118674c4e47eb7c939bb29a379d381">CONST_VUINT128_QxD</a>(__q0, __q1)</td></tr>
<tr class="memdesc:a12118674c4e47eb7c939bb29a379d381"><td class="mdescLeft"> </td><td class="mdescRight">Generate a vector unsigned __int128 constant from doublewords. <a href="#a12118674c4e47eb7c939bb29a379d381">More...</a><br /></td></tr>
<tr class="separator:a12118674c4e47eb7c939bb29a379d381"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a25faf0c51245eefdaeda1dc5dd71c516"><td class="memItemLeft" align="right" valign="top">#define </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a25faf0c51245eefdaeda1dc5dd71c516">CONST_VUINT128_Qx19d</a>(__q0, __q1)</td></tr>
<tr class="memdesc:a25faf0c51245eefdaeda1dc5dd71c516"><td class="mdescLeft"> </td><td class="mdescRight">Generate a vector unsigned __int128 constant from doublewords. <a href="#a25faf0c51245eefdaeda1dc5dd71c516">More...</a><br /></td></tr>
<tr class="separator:a25faf0c51245eefdaeda1dc5dd71c516"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aa9c94b59ae2504f498923ed506a22083"><td class="memItemLeft" align="right" valign="top">#define </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aa9c94b59ae2504f498923ed506a22083">CONST_VUINT128_Qx18d</a>(__q0, __q1)</td></tr>
<tr class="memdesc:aa9c94b59ae2504f498923ed506a22083"><td class="mdescLeft"> </td><td class="mdescRight">Generate a vector unsigned __int128 constant from doublewords. <a href="#aa9c94b59ae2504f498923ed506a22083">More...</a><br /></td></tr>
<tr class="separator:aa9c94b59ae2504f498923ed506a22083"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:acd5c20e29b155f8f575d60f6af8f7955"><td class="memItemLeft" align="right" valign="top">#define </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#acd5c20e29b155f8f575d60f6af8f7955">CONST_VUINT128_Qx16d</a>(__q0, __q1)</td></tr>
<tr class="memdesc:acd5c20e29b155f8f575d60f6af8f7955"><td class="mdescLeft"> </td><td class="mdescRight">Generate a vector unsigned __int128 constant from doublewords. <a href="#acd5c20e29b155f8f575d60f6af8f7955">More...</a><br /></td></tr>
<tr class="separator:acd5c20e29b155f8f575d60f6af8f7955"><td class="memSeparator" colspan="2"> </td></tr>
</table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
Functions</h2></td></tr>
<tr class="memitem:abf1707d712cc191915a8f558eaaa1fe7"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#abf1707d712cc191915a8f558eaaa1fe7">vec_absduq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:abf1707d712cc191915a8f558eaaa1fe7"><td class="mdescLeft"> </td><td class="mdescRight">Vector Absolute Difference Unsigned Quadword. <a href="#abf1707d712cc191915a8f558eaaa1fe7">More...</a><br /></td></tr>
<tr class="separator:abf1707d712cc191915a8f558eaaa1fe7"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a655de600915e449a8681572961939422"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a655de600915e449a8681572961939422">vec_avguq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a655de600915e449a8681572961939422"><td class="mdescLeft"> </td><td class="mdescRight">Vector Average Unsigned Quadword. <a href="#a655de600915e449a8681572961939422">More...</a><br /></td></tr>
<tr class="separator:a655de600915e449a8681572961939422"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ad7aaadba249ce46c4c94f78df1020da3"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:ad7aaadba249ce46c4c94f78df1020da3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add &amp; write Carry Unsigned Quadword. <a href="#ad7aaadba249ce46c4c94f78df1020da3">More...</a><br /></td></tr>
<tr class="separator:ad7aaadba249ce46c4c94f78df1020da3"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af18b98d2d73f1afbc439e1407c78f305"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af18b98d2d73f1afbc439e1407c78f305">vec_addecuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ci)</td></tr>
<tr class="memdesc:af18b98d2d73f1afbc439e1407c78f305"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Add Extended &amp; write Carry Unsigned Quadword. <a href="#af18b98d2d73f1afbc439e1407c78f305">More...</a><br /></td></tr>
<tr class="separator:af18b98d2d73f1afbc439e1407c78f305"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a44e63f70b182d60fe03b43a80647451a"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a44e63f70b182d60fe03b43a80647451a">vec_addeuqm</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ci)</td></tr>
<tr class="memdesc:a44e63f70b182d60fe03b43a80647451a"><td class="mdescLeft"> </td><td class="mdescRight">Vector Add Extended Unsigned Quadword Modulo. <a href="#a44e63f70b182d60fe03b43a80647451a">More...</a><br /></td></tr>
<tr class="separator:a44e63f70b182d60fe03b43a80647451a"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a539de2a4426a84102471306acc571ce8"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:a539de2a4426a84102471306acc571ce8"><td class="mdescLeft"> </td><td class="mdescRight">Vector Add Unsigned Quadword Modulo. <a href="#a539de2a4426a84102471306acc571ce8">More...</a><br /></td></tr>
<tr class="separator:a539de2a4426a84102471306acc571ce8"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a363fa7103ccd730c47bb34cb9f05e80b"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b">vec_addcq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:a363fa7103ccd730c47bb34cb9f05e80b"><td class="mdescLeft"> </td><td class="mdescRight">Vector Add with carry Unsigned Quadword. <a href="#a363fa7103ccd730c47bb34cb9f05e80b">More...</a><br /></td></tr>
<tr class="separator:a363fa7103ccd730c47bb34cb9f05e80b"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9e27910c148d525e17d099688aec9ba1"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9e27910c148d525e17d099688aec9ba1">vec_addeq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ci)</td></tr>
<tr class="memdesc:a9e27910c148d525e17d099688aec9ba1"><td class="mdescLeft"> </td><td class="mdescRight">Vector Add Extend with carry Unsigned Quadword. <a href="#a9e27910c148d525e17d099688aec9ba1">More...</a><br /></td></tr>
<tr class="separator:a9e27910c148d525e17d099688aec9ba1"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a066cc120c198773a2f8dfd17480b7a49"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a066cc120c198773a2f8dfd17480b7a49">vec_clzq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="memdesc:a066cc120c198773a2f8dfd17480b7a49"><td class="mdescLeft"> </td><td class="mdescRight">Vector Count Leading Zeros Quadword. <a href="#a066cc120c198773a2f8dfd17480b7a49">More...</a><br /></td></tr>
<tr class="separator:a066cc120c198773a2f8dfd17480b7a49"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:adf308aff6d9e25ae55b2c9d998c5de68"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#adf308aff6d9e25ae55b2c9d998c5de68">vec_cmpeqsq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:adf308aff6d9e25ae55b2c9d998c5de68"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Equal Signed Quadword. <a href="#adf308aff6d9e25ae55b2c9d998c5de68">More...</a><br /></td></tr>
<tr class="separator:adf308aff6d9e25ae55b2c9d998c5de68"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a7197cd5c6e946211f2718b5e8464cdc0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a7197cd5c6e946211f2718b5e8464cdc0">vec_cmpequq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a7197cd5c6e946211f2718b5e8464cdc0"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Equal Unsigned Quadword. <a href="#a7197cd5c6e946211f2718b5e8464cdc0">More...</a><br /></td></tr>
<tr class="separator:a7197cd5c6e946211f2718b5e8464cdc0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ac7e92209124903c3e8c535263246ff37"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ac7e92209124903c3e8c535263246ff37">vec_cmpgesq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:ac7e92209124903c3e8c535263246ff37"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Greater Than or Equal Signed Quadword. <a href="#ac7e92209124903c3e8c535263246ff37">More...</a><br /></td></tr>
<tr class="separator:ac7e92209124903c3e8c535263246ff37"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:abd88782f327214c07d42519b7d4c69ce"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#abd88782f327214c07d42519b7d4c69ce">vec_cmpgeuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:abd88782f327214c07d42519b7d4c69ce"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Greater Than or Equal Unsigned Quadword. <a href="#abd88782f327214c07d42519b7d4c69ce">More...</a><br /></td></tr>
<tr class="separator:abd88782f327214c07d42519b7d4c69ce"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a45ce55b0cba15cddb6764a900922d768"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a45ce55b0cba15cddb6764a900922d768">vec_cmpgtsq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a45ce55b0cba15cddb6764a900922d768"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Greater Than Signed Quadword. <a href="#a45ce55b0cba15cddb6764a900922d768">More...</a><br /></td></tr>
<tr class="separator:a45ce55b0cba15cddb6764a900922d768"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ad4ce43dcbc14fb34623d5ece8073b86e"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ad4ce43dcbc14fb34623d5ece8073b86e">vec_cmpgtuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:ad4ce43dcbc14fb34623d5ece8073b86e"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Greater Than Unsigned Quadword. <a href="#ad4ce43dcbc14fb34623d5ece8073b86e">More...</a><br /></td></tr>
<tr class="separator:ad4ce43dcbc14fb34623d5ece8073b86e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a3642b1e0d48117c91189f69dd0e955ad"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a3642b1e0d48117c91189f69dd0e955ad">vec_cmplesq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a3642b1e0d48117c91189f69dd0e955ad"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Less Than or Equal Signed Quadword. <a href="#a3642b1e0d48117c91189f69dd0e955ad">More...</a><br /></td></tr>
<tr class="separator:a3642b1e0d48117c91189f69dd0e955ad"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a69b25e7f46986d00997fedaeeb7871c2"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a69b25e7f46986d00997fedaeeb7871c2">vec_cmpleuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a69b25e7f46986d00997fedaeeb7871c2"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Less Than or Equal Unsigned Quadword. <a href="#a69b25e7f46986d00997fedaeeb7871c2">More...</a><br /></td></tr>
<tr class="separator:a69b25e7f46986d00997fedaeeb7871c2"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a82d83d78ff2330205a8d74741b34a1be"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a82d83d78ff2330205a8d74741b34a1be">vec_cmpltsq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a82d83d78ff2330205a8d74741b34a1be"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Less Than Signed Quadword. <a href="#a82d83d78ff2330205a8d74741b34a1be">More...</a><br /></td></tr>
<tr class="separator:a82d83d78ff2330205a8d74741b34a1be"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a7f9ebc5ad32b151a3e08136d51aad4dc"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a7f9ebc5ad32b151a3e08136d51aad4dc">vec_cmpltuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a7f9ebc5ad32b151a3e08136d51aad4dc"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Less Than Unsigned Quadword. <a href="#a7f9ebc5ad32b151a3e08136d51aad4dc">More...</a><br /></td></tr>
<tr class="separator:a7f9ebc5ad32b151a3e08136d51aad4dc"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae2cdf052bf633951201589454e50f52e"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ae2cdf052bf633951201589454e50f52e">vec_cmpnesq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:ae2cdf052bf633951201589454e50f52e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Compare Not Equal Signed Quadword. <a href="#ae2cdf052bf633951201589454e50f52e">More...</a><br /></td></tr>
<tr class="separator:ae2cdf052bf633951201589454e50f52e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a186d0b94bbc652e700ab4e1733b9524c"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a186d0b94bbc652e700ab4e1733b9524c">vec_cmpneuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a186d0b94bbc652e700ab4e1733b9524c"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare Not Equal Unsigned Quadword. <a href="#a186d0b94bbc652e700ab4e1733b9524c">More...</a><br /></td></tr>
<tr class="separator:a186d0b94bbc652e700ab4e1733b9524c"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a391cc9e4b1221618840767c7487d3032"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a391cc9e4b1221618840767c7487d3032">vec_cmpsq_all_eq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a391cc9e4b1221618840767c7487d3032"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Equal Signed Quadword. <a href="#a391cc9e4b1221618840767c7487d3032">More...</a><br /></td></tr>
<tr class="separator:a391cc9e4b1221618840767c7487d3032"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a269401b65405524bb2d971bef595cb0d"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a269401b65405524bb2d971bef595cb0d">vec_cmpsq_all_ge</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a269401b65405524bb2d971bef595cb0d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Compare all Greater Than or Equal Signed Quadword. <a href="#a269401b65405524bb2d971bef595cb0d">More...</a><br /></td></tr>
<tr class="separator:a269401b65405524bb2d971bef595cb0d"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a62a38e9016e2d94a56f935ddded3830b"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a62a38e9016e2d94a56f935ddded3830b">vec_cmpsq_all_gt</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a62a38e9016e2d94a56f935ddded3830b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Vector Compare all Greater Than Signed Quadword. <a href="#a62a38e9016e2d94a56f935ddded3830b">More...</a><br /></td></tr>
<tr class="separator:a62a38e9016e2d94a56f935ddded3830b"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a88209b466e628a6a77c6ddab7a15b4c8"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a88209b466e628a6a77c6ddab7a15b4c8">vec_cmpsq_all_le</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a88209b466e628a6a77c6ddab7a15b4c8"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Less Than or Equal Signed Quadword. <a href="#a88209b466e628a6a77c6ddab7a15b4c8">More...</a><br /></td></tr>
<tr class="separator:a88209b466e628a6a77c6ddab7a15b4c8"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a395dad1916a94a6cdb2b601565d7ffce"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a395dad1916a94a6cdb2b601565d7ffce">vec_cmpsq_all_lt</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:a395dad1916a94a6cdb2b601565d7ffce"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Less Than Signed Quadword. <a href="#a395dad1916a94a6cdb2b601565d7ffce">More...</a><br /></td></tr>
<tr class="separator:a395dad1916a94a6cdb2b601565d7ffce"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af7587275a406a1e2437ef86c23e2875a"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af7587275a406a1e2437ef86c23e2875a">vec_cmpsq_all_ne</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:af7587275a406a1e2437ef86c23e2875a"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Not Equal Signed Quadword. <a href="#af7587275a406a1e2437ef86c23e2875a">More...</a><br /></td></tr>
<tr class="separator:af7587275a406a1e2437ef86c23e2875a"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2c2c01f3aa165fedba47600f87067768"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2c2c01f3aa165fedba47600f87067768">vec_cmpuq_all_eq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a2c2c01f3aa165fedba47600f87067768"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Equal Unsigned Quadword. <a href="#a2c2c01f3aa165fedba47600f87067768">More...</a><br /></td></tr>
<tr class="separator:a2c2c01f3aa165fedba47600f87067768"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af8f06b2c3d612a7cfdeb3bb883c59e19"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:af8f06b2c3d612a7cfdeb3bb883c59e19"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Greater Than or Equal Unsigned Quadword. <a href="#af8f06b2c3d612a7cfdeb3bb883c59e19">More...</a><br /></td></tr>
<tr class="separator:af8f06b2c3d612a7cfdeb3bb883c59e19"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ac93dc5ed8bb3501470cf70c5cb5796a9"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ac93dc5ed8bb3501470cf70c5cb5796a9">vec_cmpuq_all_gt</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:ac93dc5ed8bb3501470cf70c5cb5796a9"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Greater Than Unsigned Quadword. <a href="#ac93dc5ed8bb3501470cf70c5cb5796a9">More...</a><br /></td></tr>
<tr class="separator:ac93dc5ed8bb3501470cf70c5cb5796a9"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2b7f505ebca731aa6fdc7433f82c0c6d"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2b7f505ebca731aa6fdc7433f82c0c6d">vec_cmpuq_all_le</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a2b7f505ebca731aa6fdc7433f82c0c6d"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Less Than or Equal Unsigned Quadword. <a href="#a2b7f505ebca731aa6fdc7433f82c0c6d">More...</a><br /></td></tr>
<tr class="separator:a2b7f505ebca731aa6fdc7433f82c0c6d"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a31ddb6149475e80f4a1d38277317d980"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a31ddb6149475e80f4a1d38277317d980">vec_cmpuq_all_lt</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a31ddb6149475e80f4a1d38277317d980"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Less Than Unsigned Quadword. <a href="#a31ddb6149475e80f4a1d38277317d980">More...</a><br /></td></tr>
<tr class="separator:a31ddb6149475e80f4a1d38277317d980"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a1799f860ba79e698c66b171392afde01"><td class="memItemLeft" align="right" valign="top">static int </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a1799f860ba79e698c66b171392afde01">vec_cmpuq_all_ne</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a1799f860ba79e698c66b171392afde01"><td class="mdescLeft"> </td><td class="mdescRight">Vector Compare all Not Equal Unsigned Quadword. <a href="#a1799f860ba79e698c66b171392afde01">More...</a><br /></td></tr>
<tr class="separator:a1799f860ba79e698c66b171392afde01"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a5f250dfab2a4aee0fd247a1d0217237b"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a5f250dfab2a4aee0fd247a1d0217237b">vec_cmul10ecuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> cin)</td></tr>
<tr class="memdesc:a5f250dfab2a4aee0fd247a1d0217237b"><td class="mdescLeft"> </td><td class="mdescRight">Vector combined Multiply by 10 Extended & write Carry Unsigned Quadword. <a href="#a5f250dfab2a4aee0fd247a1d0217237b">More...</a><br /></td></tr>
<tr class="separator:a5f250dfab2a4aee0fd247a1d0217237b"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9449c746cad42f0cd9e2fe4560364e18"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9449c746cad42f0cd9e2fe4560364e18">vec_cmul10cuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a)</td></tr>
<tr class="memdesc:a9449c746cad42f0cd9e2fe4560364e18"><td class="mdescLeft"> </td><td class="mdescRight">Vector combined Multiply by 10 & write Carry Unsigned Quadword. <a href="#a9449c746cad42f0cd9e2fe4560364e18">More...</a><br /></td></tr>
<tr class="separator:a9449c746cad42f0cd9e2fe4560364e18"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae320909aca43d55b8be1069f38544ee8"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ae320909aca43d55b8be1069f38544ee8">vec_divsq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra)</td></tr>
<tr class="memdesc:ae320909aca43d55b8be1069f38544ee8"><td class="mdescLeft"> </td><td class="mdescRight">Vector Divide by const 10e31 Signed Quadword. <a href="#ae320909aca43d55b8be1069f38544ee8">More...</a><br /></td></tr>
<tr class="separator:ae320909aca43d55b8be1069f38544ee8"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:afa2db6d665f837f96c746d88027e9e19"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#afa2db6d665f837f96c746d88027e9e19">vec_divudq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *qh, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:afa2db6d665f837f96c746d88027e9e19"><td class="mdescLeft"> </td><td class="mdescRight">Vector Divide Unsigned Double Quadword by const 10e31. <a href="#afa2db6d665f837f96c746d88027e9e19">More...</a><br /></td></tr>
<tr class="separator:afa2db6d665f837f96c746d88027e9e19"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a917acd42e775f4bb323ba2104c52d7cb"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a917acd42e775f4bb323ba2104c52d7cb">vec_divudq_10e32</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *qh, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a917acd42e775f4bb323ba2104c52d7cb"><td class="mdescLeft"> </td><td class="mdescRight">Vector Divide Unsigned Double Quadword by const 10e32. <a href="#a917acd42e775f4bb323ba2104c52d7cb">More...</a><br /></td></tr>
<tr class="separator:a917acd42e775f4bb323ba2104c52d7cb"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9a6a39212f8a8b9ebf20e0117e1e1e88"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88">vec_divuq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="memdesc:a9a6a39212f8a8b9ebf20e0117e1e1e88"><td class="mdescLeft"> </td><td class="mdescRight">Vector Divide by const 10e31 Unsigned Quadword. <a href="#a9a6a39212f8a8b9ebf20e0117e1e1e88">More...</a><br /></td></tr>
<tr class="separator:a9a6a39212f8a8b9ebf20e0117e1e1e88"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae2b45341cc9cc918198bb69da0552098"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ae2b45341cc9cc918198bb69da0552098">vec_divuq_10e32</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="memdesc:ae2b45341cc9cc918198bb69da0552098"><td class="mdescLeft"> </td><td class="mdescRight">Vector Divide by const 10e32 Unsigned Quadword. <a href="#ae2b45341cc9cc918198bb69da0552098">More...</a><br /></td></tr>
<tr class="separator:ae2b45341cc9cc918198bb69da0552098"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:acef64f9ffe8af5a8f08b6bdd0a9e218f"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#acef64f9ffe8af5a8f08b6bdd0a9e218f">vec_maxsq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:acef64f9ffe8af5a8f08b6bdd0a9e218f"><td class="mdescLeft"> </td><td class="mdescRight">Vector Maximum Signed Quadword. <a href="#acef64f9ffe8af5a8f08b6bdd0a9e218f">More...</a><br /></td></tr>
<tr class="separator:acef64f9ffe8af5a8f08b6bdd0a9e218f"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:adb02d0572ecc17eca0de6d4f0d9aa302"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#adb02d0572ecc17eca0de6d4f0d9aa302">vec_maxuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:adb02d0572ecc17eca0de6d4f0d9aa302"><td class="mdescLeft"> </td><td class="mdescRight">Vector Maximum Unsigned Quadword. <a href="#adb02d0572ecc17eca0de6d4f0d9aa302">More...</a><br /></td></tr>
<tr class="separator:adb02d0572ecc17eca0de6d4f0d9aa302"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ab435fd182688a615fb88b6578321839d"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ab435fd182688a615fb88b6578321839d">vec_minsq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vrb)</td></tr>
<tr class="memdesc:ab435fd182688a615fb88b6578321839d"><td class="mdescLeft"> </td><td class="mdescRight">Vector Minimum Signed Quadword. <a href="#ab435fd182688a615fb88b6578321839d">More...</a><br /></td></tr>
<tr class="separator:ab435fd182688a615fb88b6578321839d"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ad0fa6a9987d3bd9593d1780b1c28c390"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ad0fa6a9987d3bd9593d1780b1c28c390">vec_minuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:ad0fa6a9987d3bd9593d1780b1c28c390"><td class="mdescLeft"> </td><td class="mdescRight">Vector Minimum Unsigned Quadword. <a href="#ad0fa6a9987d3bd9593d1780b1c28c390">More...</a><br /></td></tr>
<tr class="separator:ad0fa6a9987d3bd9593d1780b1c28c390"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aab5db88e4608d4a7408df9042adce86c"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aab5db88e4608d4a7408df9042adce86c">vec_modsq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> q)</td></tr>
<tr class="memdesc:aab5db88e4608d4a7408df9042adce86c"><td class="mdescLeft"> </td><td class="mdescRight">Vector Modulo by const 10e31 Signed Quadword. <a href="#aab5db88e4608d4a7408df9042adce86c">More...</a><br /></td></tr>
<tr class="separator:aab5db88e4608d4a7408df9042adce86c"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a31a893a75e42f5f6c4dfe793678fea59"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a31a893a75e42f5f6c4dfe793678fea59">vec_modudq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *ql)</td></tr>
<tr class="memdesc:a31a893a75e42f5f6c4dfe793678fea59"><td class="mdescLeft"> </td><td class="mdescRight">Vector Modulo Unsigned Double Quadword by const 10e31. <a href="#a31a893a75e42f5f6c4dfe793678fea59">More...</a><br /></td></tr>
<tr class="separator:a31a893a75e42f5f6c4dfe793678fea59"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2ccbd77900956c01a51b88e672e593c6"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2ccbd77900956c01a51b88e672e593c6">vec_modudq_10e32</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *ql)</td></tr>
<tr class="memdesc:a2ccbd77900956c01a51b88e672e593c6"><td class="mdescLeft"> </td><td class="mdescRight">Vector Modulo Unsigned Double Quadword by const 10e32. <a href="#a2ccbd77900956c01a51b88e672e593c6">More...</a><br /></td></tr>
<tr class="separator:a2ccbd77900956c01a51b88e672e593c6"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af4b3b91f7e80522d8a8c0c171e077b99"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af4b3b91f7e80522d8a8c0c171e077b99">vec_moduq_10e31</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> q)</td></tr>
<tr class="memdesc:af4b3b91f7e80522d8a8c0c171e077b99"><td class="mdescLeft"> </td><td class="mdescRight">Vector Modulo by const 10e31 Unsigned Quadword. <a href="#af4b3b91f7e80522d8a8c0c171e077b99">More...</a><br /></td></tr>
<tr class="separator:af4b3b91f7e80522d8a8c0c171e077b99"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aff4f1d8a707289d2271eafad4aeb1e82"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aff4f1d8a707289d2271eafad4aeb1e82">vec_moduq_10e32</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> q)</td></tr>
<tr class="memdesc:aff4f1d8a707289d2271eafad4aeb1e82"><td class="mdescLeft"> </td><td class="mdescRight">Vector Modulo by const 10e32 Unsigned Quadword. <a href="#aff4f1d8a707289d2271eafad4aeb1e82">More...</a><br /></td></tr>
<tr class="separator:aff4f1d8a707289d2271eafad4aeb1e82"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a8c641b0107fc3e1621ef729c04efd583"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a8c641b0107fc3e1621ef729c04efd583">vec_mul10cuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a)</td></tr>
<tr class="memdesc:a8c641b0107fc3e1621ef729c04efd583"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply by 10 & write Carry Unsigned Quadword. <a href="#a8c641b0107fc3e1621ef729c04efd583">More...</a><br /></td></tr>
<tr class="separator:a8c641b0107fc3e1621ef729c04efd583"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a7ca2a6427ecb9458858b5caaac8c4dca"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a7ca2a6427ecb9458858b5caaac8c4dca">vec_mul10ecuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> cin)</td></tr>
<tr class="memdesc:a7ca2a6427ecb9458858b5caaac8c4dca"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply by 10 Extended & write Carry Unsigned Quadword. <a href="#a7ca2a6427ecb9458858b5caaac8c4dca">More...</a><br /></td></tr>
<tr class="separator:a7ca2a6427ecb9458858b5caaac8c4dca"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2245626e7b90621b33ba79b763a4215e"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2245626e7b90621b33ba79b763a4215e">vec_mul10euq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> cin)</td></tr>
<tr class="memdesc:a2245626e7b90621b33ba79b763a4215e"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply by 10 Extended Unsigned Quadword. <a href="#a2245626e7b90621b33ba79b763a4215e">More...</a><br /></td></tr>
<tr class="separator:a2245626e7b90621b33ba79b763a4215e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a3675fa1a2334eff913df447904be78ad"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a3675fa1a2334eff913df447904be78ad">vec_mul10uq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a)</td></tr>
<tr class="memdesc:a3675fa1a2334eff913df447904be78ad"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply by 10 Unsigned Quadword. <a href="#a3675fa1a2334eff913df447904be78ad">More...</a><br /></td></tr>
<tr class="separator:a3675fa1a2334eff913df447904be78ad"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a8bc23a0cd3f522c017ec95d5ce93a2f0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a8bc23a0cd3f522c017ec95d5ce93a2f0">vec_cmul100cuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a)</td></tr>
<tr class="memdesc:a8bc23a0cd3f522c017ec95d5ce93a2f0"><td class="mdescLeft"> </td><td class="mdescRight">Vector combined Multiply by 100 & write Carry Unsigned Quadword. <a href="#a8bc23a0cd3f522c017ec95d5ce93a2f0">More...</a><br /></td></tr>
<tr class="separator:a8bc23a0cd3f522c017ec95d5ce93a2f0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ac3f85b6577e5ab0de2b3f68ca45dd33b"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ac3f85b6577e5ab0de2b3f68ca45dd33b">vec_cmul100ecuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *cout, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> cin)</td></tr>
<tr class="memdesc:ac3f85b6577e5ab0de2b3f68ca45dd33b"><td class="mdescLeft"> </td><td class="mdescRight">Vector combined Multiply by 100 Extended & write Carry Unsigned Quadword. <a href="#ac3f85b6577e5ab0de2b3f68ca45dd33b">More...</a><br /></td></tr>
<tr class="separator:ac3f85b6577e5ab0de2b3f68ca45dd33b"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a1d183ebd232e5826be109cdaa421aeed"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed">vec_msumudm</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a1d183ebd232e5826be109cdaa421aeed"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Sum Unsigned Doubleword Modulo. <a href="#a1d183ebd232e5826be109cdaa421aeed">More...</a><br /></td></tr>
<tr class="separator:a1d183ebd232e5826be109cdaa421aeed"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a26f95e02f7b0551e3f2bb7e4b4da040d"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a26f95e02f7b0551e3f2bb7e4b4da040d">vec_muleud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</td></tr>
<tr class="memdesc:a26f95e02f7b0551e3f2bb7e4b4da040d"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Even Unsigned Doublewords. <a href="#a26f95e02f7b0551e3f2bb7e4b4da040d">More...</a><br /></td></tr>
<tr class="separator:a26f95e02f7b0551e3f2bb7e4b4da040d"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a10780cd8a88f18ec564ee6254c179a06"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a10780cd8a88f18ec564ee6254c179a06">vec_mulhud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> vrb)</td></tr>
<tr class="memdesc:a10780cd8a88f18ec564ee6254c179a06"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply High Unsigned Doubleword. <a href="#a10780cd8a88f18ec564ee6254c179a06">More...</a><br /></td></tr>
<tr class="separator:a10780cd8a88f18ec564ee6254c179a06"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aa989582cbfaa7984f78a937225e92f4a"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aa989582cbfaa7984f78a937225e92f4a">vec_muloud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</td></tr>
<tr class="memdesc:aa989582cbfaa7984f78a937225e92f4a"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Odd Unsigned Doublewords. <a href="#aa989582cbfaa7984f78a937225e92f4a">More...</a><br /></td></tr>
<tr class="separator:aa989582cbfaa7984f78a937225e92f4a"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2f19a53d6d28ac9b2aab5b8e1c5b2cbb"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2f19a53d6d28ac9b2aab5b8e1c5b2cbb">vec_muludm</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> vrb)</td></tr>
<tr class="memdesc:a2f19a53d6d28ac9b2aab5b8e1c5b2cbb"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Unsigned Doubleword Modulo. <a href="#a2f19a53d6d28ac9b2aab5b8e1c5b2cbb">More...</a><br /></td></tr>
<tr class="separator:a2f19a53d6d28ac9b2aab5b8e1c5b2cbb"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ad6be9c8f02e43c39a659d6bbc9c3a2d2"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:ad6be9c8f02e43c39a659d6bbc9c3a2d2"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply High Unsigned Quadword. <a href="#ad6be9c8f02e43c39a659d6bbc9c3a2d2">More...</a><br /></td></tr>
<tr class="separator:ad6be9c8f02e43c39a659d6bbc9c3a2d2"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9aaaf0e4c2705be1e0e8e925b09c52de"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de">vec_mulluq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:a9aaaf0e4c2705be1e0e8e925b09c52de"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Low Unsigned Quadword. <a href="#a9aaaf0e4c2705be1e0e8e925b09c52de">More...</a><br /></td></tr>
<tr class="separator:a9aaaf0e4c2705be1e0e8e925b09c52de"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aee5c5b2998ef105b4c6f39739748ffa8"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</td></tr>
<tr class="memdesc:aee5c5b2998ef105b4c6f39739748ffa8"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Unsigned Double Quadword. <a href="#aee5c5b2998ef105b4c6f39739748ffa8">More...</a><br /></td></tr>
<tr class="separator:aee5c5b2998ef105b4c6f39739748ffa8"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2a38409db81a0765586023d5874ab6b0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2a38409db81a0765586023d5874ab6b0">vec_madduq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a2a38409db81a0765586023d5874ab6b0"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add Unsigned Quadword. <a href="#a2a38409db81a0765586023d5874ab6b0">More...</a><br /></td></tr>
<tr class="separator:a2a38409db81a0765586023d5874ab6b0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a7e7f25c382f8016baa6452fc02c03b83"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a7e7f25c382f8016baa6452fc02c03b83">vec_madd2uq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c1, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c2)</td></tr>
<tr class="memdesc:a7e7f25c382f8016baa6452fc02c03b83"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add2 Unsigned Quadword. <a href="#a7e7f25c382f8016baa6452fc02c03b83">More...</a><br /></td></tr>
<tr class="separator:a7e7f25c382f8016baa6452fc02c03b83"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae05185c980535dd28aec3a2a9431cb69"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ae05185c980535dd28aec3a2a9431cb69">vec_popcntq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="memdesc:ae05185c980535dd28aec3a2a9431cb69"><td class="mdescLeft"> </td><td class="mdescRight">Vector Population Count Quadword. <a href="#ae05185c980535dd28aec3a2a9431cb69">More...</a><br /></td></tr>
<tr class="separator:ae05185c980535dd28aec3a2a9431cb69"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aa40644aaa8146d00f84fce58dd4fd24e"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aa40644aaa8146d00f84fce58dd4fd24e">vec_revbq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="memdesc:aa40644aaa8146d00f84fce58dd4fd24e"><td class="mdescLeft"> </td><td class="mdescRight">Vector Byte Reverse Quadword. <a href="#aa40644aaa8146d00f84fce58dd4fd24e">More...</a><br /></td></tr>
<tr class="separator:aa40644aaa8146d00f84fce58dd4fd24e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a52b3bfcc5b277628cd80ecd90440f8a5"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a52b3bfcc5b277628cd80ecd90440f8a5">vec_rlq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a52b3bfcc5b277628cd80ecd90440f8a5"><td class="mdescLeft"> </td><td class="mdescRight">Vector Rotate Left Quadword. <a href="#a52b3bfcc5b277628cd80ecd90440f8a5">More...</a><br /></td></tr>
<tr class="separator:a52b3bfcc5b277628cd80ecd90440f8a5"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a4777f3b762c3313df0a13aa352c2f189"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a4777f3b762c3313df0a13aa352c2f189">vec_rlqi</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, const unsigned int shb)</td></tr>
<tr class="memdesc:a4777f3b762c3313df0a13aa352c2f189"><td class="mdescLeft"> </td><td class="mdescRight">Vector Rotate Left Quadword Immediate. <a href="#a4777f3b762c3313df0a13aa352c2f189">More...</a><br /></td></tr>
<tr class="separator:a4777f3b762c3313df0a13aa352c2f189"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af74036e39e72e0f3c29706d30fbb96d1"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af74036e39e72e0f3c29706d30fbb96d1">vec_setb_cyq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vcy)</td></tr>
<tr class="memdesc:af74036e39e72e0f3c29706d30fbb96d1"><td class="mdescLeft"> </td><td class="mdescRight">Vector Set Bool from Quadword Carry. <a href="#af74036e39e72e0f3c29706d30fbb96d1">More...</a><br /></td></tr>
<tr class="separator:af74036e39e72e0f3c29706d30fbb96d1"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a921647d5b67f0de5006ee32fb3d9c4f1"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a921647d5b67f0de5006ee32fb3d9c4f1">vec_setb_ncq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vcy)</td></tr>
<tr class="memdesc:a921647d5b67f0de5006ee32fb3d9c4f1"><td class="mdescLeft"> </td><td class="mdescRight">Vector Set Bool from Quadword not Carry. <a href="#a921647d5b67f0de5006ee32fb3d9c4f1">More...</a><br /></td></tr>
<tr class="separator:a921647d5b67f0de5006ee32fb3d9c4f1"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af21d01bb19f0ea8605d8c37035837802"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a16cdf519bbbf190c311bd27d3e254208">vb128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af21d01bb19f0ea8605d8c37035837802">vec_setb_sq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra)</td></tr>
<tr class="memdesc:af21d01bb19f0ea8605d8c37035837802"><td class="mdescLeft"> </td><td class="mdescRight">Vector Set Bool from Signed Quadword. <a href="#af21d01bb19f0ea8605d8c37035837802">More...</a><br /></td></tr>
<tr class="separator:af21d01bb19f0ea8605d8c37035837802"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a8ba40be93339359793ef776e1d5d7577"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a8ba40be93339359793ef776e1d5d7577">vec_sldq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrw, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrx, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a8ba40be93339359793ef776e1d5d7577"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Left Double Quadword. <a href="#a8ba40be93339359793ef776e1d5d7577">More...</a><br /></td></tr>
<tr class="separator:a8ba40be93339359793ef776e1d5d7577"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:aaa33904ec4de42f54cceab34adb303c5"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#aaa33904ec4de42f54cceab34adb303c5">vec_sldqi</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrw, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrx, const unsigned int shb)</td></tr>
<tr class="memdesc:aaa33904ec4de42f54cceab34adb303c5"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Left Double Quadword Immediate. <a href="#aaa33904ec4de42f54cceab34adb303c5">More...</a><br /></td></tr>
<tr class="separator:aaa33904ec4de42f54cceab34adb303c5"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a49fe2c36fca9911ab99a1f8abb53f0ff"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a49fe2c36fca9911ab99a1f8abb53f0ff">vec_slq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a49fe2c36fca9911ab99a1f8abb53f0ff"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Left Quadword. <a href="#a49fe2c36fca9911ab99a1f8abb53f0ff">More...</a><br /></td></tr>
<tr class="separator:a49fe2c36fca9911ab99a1f8abb53f0ff"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a070fe972995f3954362835f5b72e5ff6"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a070fe972995f3954362835f5b72e5ff6">vec_slqi</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, const unsigned int shb)</td></tr>
<tr class="memdesc:a070fe972995f3954362835f5b72e5ff6"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Left Quadword Immediate. <a href="#a070fe972995f3954362835f5b72e5ff6">More...</a><br /></td></tr>
<tr class="separator:a070fe972995f3954362835f5b72e5ff6"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ab99c4af0aae31b02e1f17f12500198f0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ab99c4af0aae31b02e1f17f12500198f0">vec_sraq</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:ab99c4af0aae31b02e1f17f12500198f0"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Right Algebraic Quadword. <a href="#ab99c4af0aae31b02e1f17f12500198f0">More...</a><br /></td></tr>
<tr class="separator:ab99c4af0aae31b02e1f17f12500198f0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a50546b5b39e0c21cffe678f225ff59b7"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a50546b5b39e0c21cffe678f225ff59b7">vec_sraqi</a> (<a class="el" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> vra, const unsigned int shb)</td></tr>
<tr class="memdesc:a50546b5b39e0c21cffe678f225ff59b7"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Right Algebraic Quadword Immediate. <a href="#a50546b5b39e0c21cffe678f225ff59b7">More...</a><br /></td></tr>
<tr class="separator:a50546b5b39e0c21cffe678f225ff59b7"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a0edd172a5656b842d6586c5078284942"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a0edd172a5656b842d6586c5078284942">vec_srq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a0edd172a5656b842d6586c5078284942"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Right Quadword. <a href="#a0edd172a5656b842d6586c5078284942">More...</a><br /></td></tr>
<tr class="separator:a0edd172a5656b842d6586c5078284942"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ac05c640c6a42770cb95466ff4a2d903c"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, const unsigned int shb)</td></tr>
<tr class="memdesc:ac05c640c6a42770cb95466ff4a2d903c"><td class="mdescLeft"> </td><td class="mdescRight">Vector Shift Right Quadword Immediate. <a href="#ac05c640c6a42770cb95466ff4a2d903c">More...</a><br /></td></tr>
<tr class="separator:ac05c640c6a42770cb95466ff4a2d903c"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a4f6dca233bb7e4edc2adb751d478572e"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a4f6dca233bb7e4edc2adb751d478572e">vec_slq4</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="separator:a4f6dca233bb7e4edc2adb751d478572e"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9964ce224b90a0986122f79f6455cba5"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9964ce224b90a0986122f79f6455cba5">vec_slq5</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="separator:a9964ce224b90a0986122f79f6455cba5"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a49d355191fabd04a434723265ccafa20"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a49d355191fabd04a434723265ccafa20">vec_srq4</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="separator:a49d355191fabd04a434723265ccafa20"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a257bcf71eefa1d08482587637dc400da"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a257bcf71eefa1d08482587637dc400da">vec_srq5</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</td></tr>
<tr class="separator:a257bcf71eefa1d08482587637dc400da"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a95d3546b2fd6840b46b031c15b4f60d3"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a95d3546b2fd6840b46b031c15b4f60d3">vec_subcuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a95d3546b2fd6840b46b031c15b4f60d3"><td class="mdescLeft"> </td><td class="mdescRight">Vector Subtract and Write Carry Unsigned Quadword. <a href="#a95d3546b2fd6840b46b031c15b4f60d3">More...</a><br /></td></tr>
<tr class="separator:a95d3546b2fd6840b46b031c15b4f60d3"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a04f6df21399a4e6228eca254611b23c5"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a04f6df21399a4e6228eca254611b23c5">vec_subecuq</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrc)</td></tr>
<tr class="memdesc:a04f6df21399a4e6228eca254611b23c5"><td class="mdescLeft"> </td><td class="mdescRight">Vector Subtract Extended and Write Carry Unsigned Quadword. <a href="#a04f6df21399a4e6228eca254611b23c5">More...</a><br /></td></tr>
<tr class="separator:a04f6df21399a4e6228eca254611b23c5"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a2e40f9bf5df59b725cbfb6738c765202"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a2e40f9bf5df59b725cbfb6738c765202">vec_subeuqm</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrc)</td></tr>
<tr class="memdesc:a2e40f9bf5df59b725cbfb6738c765202"><td class="mdescLeft"> </td><td class="mdescRight">Vector Subtract Extended Unsigned Quadword Modulo. <a href="#a2e40f9bf5df59b725cbfb6738c765202">More...</a><br /></td></tr>
<tr class="separator:a2e40f9bf5df59b725cbfb6738c765202"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a6bafb410404d4f1e10a99263b57d1df0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (<a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</td></tr>
<tr class="memdesc:a6bafb410404d4f1e10a99263b57d1df0"><td class="mdescLeft"> </td><td class="mdescRight">Vector Subtract Unsigned Quadword Modulo. <a href="#a6bafb410404d4f1e10a99263b57d1df0">More...</a><br /></td></tr>
<tr class="separator:a6bafb410404d4f1e10a99263b57d1df0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a84e6361054b52ac4564bcef25b718151"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</td></tr>
<tr class="memdesc:a84e6361054b52ac4564bcef25b718151"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Even Unsigned Doublewords. <a href="#a84e6361054b52ac4564bcef25b718151">More...</a><br /></td></tr>
<tr class="separator:a84e6361054b52ac4564bcef25b718151"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a6280736f91cb67eca10b55e750bfe1de"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a6280736f91cb67eca10b55e750bfe1de">vec_vmaddeud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c)</td></tr>
<tr class="memdesc:a6280736f91cb67eca10b55e750bfe1de"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add Even Unsigned Doublewords. <a href="#a6280736f91cb67eca10b55e750bfe1de">More...</a><br /></td></tr>
<tr class="separator:a6280736f91cb67eca10b55e750bfe1de"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a9e8daafaa42c16823750d7fe61224662"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a9e8daafaa42c16823750d7fe61224662">vec_vmadd2eud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> d)</td></tr>
<tr class="memdesc:a9e8daafaa42c16823750d7fe61224662"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add2 Even Unsigned Doublewords. <a href="#a9e8daafaa42c16823750d7fe61224662">More...</a><br /></td></tr>
<tr class="separator:a9e8daafaa42c16823750d7fe61224662"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a208744996e7482604ad274b44999d6ce"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</td></tr>
<tr class="memdesc:a208744996e7482604ad274b44999d6ce"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply Odd Unsigned Doublewords. <a href="#a208744996e7482604ad274b44999d6ce">More...</a><br /></td></tr>
<tr class="separator:a208744996e7482604ad274b44999d6ce"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae0b83d2696455fea53b1ecf434a0daf8"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#ae0b83d2696455fea53b1ecf434a0daf8">vec_vmaddoud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c)</td></tr>
<tr class="memdesc:ae0b83d2696455fea53b1ecf434a0daf8"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add Odd Unsigned Doublewords. <a href="#ae0b83d2696455fea53b1ecf434a0daf8">More...</a><br /></td></tr>
<tr class="separator:ae0b83d2696455fea53b1ecf434a0daf8"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:af66cdabc7f2bc00f79579ce90fd483e0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#af66cdabc7f2bc00f79579ce90fd483e0">vec_vmadd2oud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> d)</td></tr>
<tr class="memdesc:af66cdabc7f2bc00f79579ce90fd483e0"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Add2 Odd Unsigned Doublewords. <a href="#af66cdabc7f2bc00f79579ce90fd483e0">More...</a><br /></td></tr>
<tr class="separator:af66cdabc7f2bc00f79579ce90fd483e0"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a43f6f199cdf39641d940f5b8d55dbf6b"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a43f6f199cdf39641d940f5b8d55dbf6b">vec_vmsumeud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a43f6f199cdf39641d940f5b8d55dbf6b"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Sum Even Unsigned Doublewords. <a href="#a43f6f199cdf39641d940f5b8d55dbf6b">More...</a><br /></td></tr>
<tr class="separator:a43f6f199cdf39641d940f5b8d55dbf6b"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a04385860c7a03a9aa57f4a31017caf81"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="vec__int128__ppc_8h.html#a04385860c7a03a9aa57f4a31017caf81">vec_vmsumoud</a> (<a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="el" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</td></tr>
<tr class="memdesc:a04385860c7a03a9aa57f4a31017caf81"><td class="mdescLeft"> </td><td class="mdescRight">Vector Multiply-Sum Odd Unsigned Doublewords. <a href="#a04385860c7a03a9aa57f4a31017caf81">More...</a><br /></td></tr>
<tr class="separator:a04385860c7a03a9aa57f4a31017caf81"><td class="memSeparator" colspan="2"> </td></tr>
</table>
<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
<div class="textblock"><p>Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX and VSX instructions. </p>
<p>Some of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. This header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins. Other operations do not exist as instructions on any current processor but are useful and should be provided. This header serves to provide these operations as inline functions using existing vector built-ins or other pveclib operations.</p>
<p>The original VMX (AKA Altivec) only defined a few instructions that operated on the 128-bit vector as a whole. This included the vector shift left/right (bit), vector shift left/right by octet (byte), vector shift left double by octet (select a contiguous 16-bytes from 2 concatenated vectors (256-bit)), and generalized vector permute (select any 16-bytes from 2 concatenated vectors). Use of these instructions can be complicated when:</p><ul>
<li>the shift amount is more than 8 bits,</li>
<li>the shift amount is not a multiple of 8-bits (octet),</li>
<li>the shift amount is a constant and needs to be generated/loaded before use.</li>
</ul>
<p>These instructions can be used in combination to provide generalized vector __int128 shift/rotate operations. Pveclib uses these operations to provide vector __int128 shift / rotate left, shift right and shift algebraic right operations. These operations require pre-conditions to avoid multiple instructions or require a combination of (bit and octet shift) instructions to get the quadword result. The compiler &lt;altivec.h&gt; built-ins only support individual instructions. So using these operations quickly inspires a need for a header (like this) to contain implementations of the common operations.</p>
<p>The VSX facility (introduced with POWER7) did not add any integer doubleword (64-bit) or quadword (128-bit) operations. However it did add a useful doubleword permute immediate and word wise; merge, shift, and splat immediate operations. Otherwise vector __int128 (128-bit elements) operations have to be implemented using VMX word and halfword element integer operations for POWER7.</p>
<p>POWER8 added multiply word operations that produce the full doubleword product and full quadword add / subtract (with carry extend). The add quadword is useful to sum the partial products for a full 128 x 128-bit multiply. The add quadword write carry and extend forms, simplify extending arithmetic to 256-bits and beyond.</p>
<p>While POWER8 provided quadword integer add and subtract operations, it did not provide quadword Signed/Unsigned integer compare operations. It is possible to implement quadword compare operations using existing word / doubleword compares and the new quadword subtract write-carry operation. The trick is to convert the carry into a vector bool __int128 via the vec_setb_ncq () operation. This header provides easy to use quadword compare operations.</p>
<p>POWER9 (PowerISA 3.0B) adds the <b>Vector Multiply-Sum unsigned Doubleword Modulo</b> instruction. Aspects of this instruction mean it needs to be used carefully as part of a larger quadword multiply. It performs only two of the four required doubleword multiplies. The final quadword modulo sum will discard any overflow/carry from the potential 130-bit result. With careful pre-conditioning of doubleword inputs the results can not overflow from 128-bits. Then separate add quadword add/write carry operations can be used to complete the sum of partial products. These techniques are used in the POWER9 specific implementations of vec_muleud, vec_muloud, vec_mulluq, and vec_muludq.</p>
<p>PowerISA 3.0B also defined additional: Binary Coded Decimal (BCD) and Zoned character format conversions. String processing operations. Vector Parity operations. Integer Extend Sign Operations. Integer Absolute Difference Operations. All of these seem to be useful additions to pveclib for older (POWER7/8) processors and across element sizes (including quadword elements).</p>
<p>Most of these intrinsic (compiler built-in) operations are defined in &lt;altivec.h&gt; and described in the compiler documentation. However it took several compiler releases for all the new POWER8 64-bit and 128-bit integer vector intrinsics to be added to <b>altivec.h</b>. This support started with GCC 4.9 but was not complete across function/type and bug free until GCC 6.0.</p>
<dl class="section note"><dt>Note</dt><dd>The compiler disables associated &lt;altivec.h&gt; built-ins if the <b>mcpu</b> target does not enable the specific instruction. For example, if you compile with <b>-mcpu=power7</b>, vec_vadduqm and vec_vsubudm will not be defined. But <a class="el" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8" title="Vector Add Unsigned Quadword Modulo. ">vec_adduqm()</a> and <a class="el" href="vec__int64__ppc_8h.html#a0b7aee3c81538f5537680b610d934500" title="Vector Subtract Unsigned Doubleword Modulo. ">vec_subudm()</a> will always be defined in this header, will generate the minimum code appropriate for the target, and produce correct results.</dd></dl>
<p>Most of these operations are implemented in a single instruction on newer (POWER8/POWER9) processors. So this header serves to fill in functional gaps for older (POWER7, POWER8) processors and provides an in-line assembler implementation for older compilers that do not provide the built-ins.</p>
<p>This header covers operations that are either:</p>
<ul>
<li>Operations implemented in hardware instructions for later processors and useful to programmers, on slightly older processors, even if the equivalent function requires more instructions. Examples include quadword byte reverse, add and subtract.</li>
<li>Defined in the OpenPOWER ABI but <em>not</em> yet defined in &lt;altivec.h&gt; provided by available compilers in common use. Examples include quadword byte reverse, add and subtract.</li>
<li>Are commonly used operations, not covered by the ABI or &lt;altivec.h&gt;, and require multiple instructions or are not obvious. Examples include quadword; Signed and Unsigned compare, shift immediate, multiply, multiply by 10 immediate, count leading zeros and population count.</li>
</ul>
<dl class="section note"><dt>Note</dt><dd>The Multiply sum/even/odd doubleword operations are currently implemented here (in &lt;<a class="el" href="vec__int128__ppc_8h.html" title="Header package containing a collection of 128-bit computation functions implemented with PowerISA VMX...">vec_int128_ppc.h</a>&gt;) which resolves a dependency on Add Quadword. These functions (vec_msumudm, vec_muleud, vec_muloud) all produce quadword results and may use the vec_adduqm implementation to sum partial products.</dd></dl>
<p>See <a class="el" href="index.html#mainpage_sub_1_3">Returning extended quadword results.</a> for more background on extended quadword computation.</p>
<h1><a class="anchor" id="i128_endian_issues_0_0"></a>
Endian problems with quadword implementations</h1>
<p>Technically operations on quadword elements should not require any endian specific transformation. There is only one element so there can be no confusion about element numbering or order. However some of the more complex quadword operations are constructed from operations on smaller elements. And those operations as provided by &lt;altivec.h&gt; are required by the OpenPOWER ABI to be endian sensitive. See <a class="el" href="vec__int64__ppc_8h.html#i64_endian_issues_0_0">Endian problems with doubleword operations</a> for a more detailed discussion.</p>
<p>In any case the arithmetic (high to low) order of bits in a quadword are defined in the PowerISA (See <a class="el" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8" title="Vector Add Unsigned Quadword Modulo. ">vec_adduqm()</a> and <a class="el" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0" title="Vector Subtract Unsigned Quadword Modulo. ">vec_subuqm()</a>). So pveclib implementations will need to either:</p><ul>
<li>Nullify little endian transforms of &lt;altivec.h&gt; operations. The &lt;altivec.h&gt; built-ins <a class="el" href="vec__int32__ppc_8h.html#ac93f07d5ad73243db2771da83b50d6d8" title="Vector multiply even unsigned words. ">vec_muleuw()</a>, <a class="el" href="vec__int32__ppc_8h.html#a3ca45c65b9627abfc493d4ad500a961d" title="Vector multiply odd unsigned words. ">vec_mulouw()</a>, vec_mergel(), and vec_mergeh() are endian sensitive and often require nullification that restores the original operation.</li>
<li>Use new operations that are specifically defined to be stable across BE/LE implementations. The pveclib operations <a class="el" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151" title="Vector Multiply Even Unsigned Doublewords. ">vec_vmuleud()</a>, <a class="el" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce" title="Vector Multiply Odd Unsigned Doublewords. ">vec_vmuloud()</a>, <a class="el" href="vec__int64__ppc_8h.html#acd5bafec6c1c15b0336551e82d1169d4" title="Vector Merge Algebraic High Doublewords. ">vec_mrgahd()</a>, <a class="el" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf" title="Vector Merge Algebraic Low Doublewords. ">vec_mrgald()</a>, and <a class="el" href="vec__int64__ppc_8h.html#a8238ba590103ac80fb146a6a2b1aed1c" title="Vector Permute Doubleword Immediate. Combine a doubleword selected from the 1st (vra) vector with a d...">vec_permdi()</a> are defined to be endian stable.</li>
</ul>
<h2><a class="anchor" id="int128_const_0_0_1"></a>
Quadword Integer Constants</h2>
<p>The compilers may not support 128-bit integers for constants and printf (integer to ascii). For example GCC provides ANSI mandated constant and runtime support for integers up to long long which for PowerPC is only 64-bit.</p>
<p>The __int128 type is an extension that provides basic arithmetic operations but does not compile 128-bit constants or support printf formatting for integers larger than long long. The following section provides examples and workarounds for these restrictions.</p>
<p>The GCC compiler allows integer constants to be assigned/cast to __int128 types. The support also allows __int128 constants to be assigned/cast to vector __int128 types. So the following are allowed: </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_zeros = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 0)};</div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_10 = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128) 10)};</div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_10to16 = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128)</div><div class="line"> 10000000000000000UL)};</div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_maxLong = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128)</div><div class="line"> __INT64_MAX__)};</div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_max_Long = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((<span class="keywordtype">unsigned</span> __int128)</div><div class="line"> 0x7fffffffffffffffL)};</div><div 
class="line"><span class="comment">// -1 signed extended to __int128 is 0xFFFF...FFFF</span></div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec128_foxes = {(<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) ((__int128) -1L)};</div></div><!-- fragment --><p>It gets more complicated when the constant exceeds the range of a long long value. For example the magic numbers for the multiplicative inverse described in <a class="el" href="vec__int128__ppc_8h.html#int128_examples_0_1_1">Printing Vector __int128 values</a>. The decimal integer constant we need for the quadword multiplier is "76624777043294442917917351357515459181" or the equivalent hexadecimal value "0x39a5652fb1137856d30baf9a1e626a6d". GCC does not allow constants this large to be expressed directly.</p>
<p>GCC supports aggregate initializer lists for the elements of vectors. For example: </p><div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> xyzw = (vector int) { 1, 2, 3, 4 };</div></div><!-- fragment --><p> So it is possible to compose a quadword constant by initializing a vector of word or doubleword elements then casting the result to a quadword type. For example: </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> invmul = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) (vector <span class="keywordtype">unsigned</span> <span class="keywordtype">long</span> <span class="keywordtype">long</span>)</div><div class="line"> { 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL };</div></div><!-- fragment --><p> or </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> invmul = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) (vector <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span>)</div><div class="line"> { 0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d };</div></div><!-- fragment --><p> There is one small problem with this as element order is endian dependent, while a vector quadword integer is always big endian. So we would need to adjust the element order for endian. 
For example: </p><div class="fragment"><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> invmul = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) (vector <span class="keywordtype">unsigned</span> <span class="keywordtype">long</span> <span class="keywordtype">long</span>)</div><div class="line"><span class="preprocessor">#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__</span></div><div class="line"> { 0xd30baf9a1e626a6dUL, 0x39a5652fb1137856UL };</div><div class="line"><span class="preprocessor">#else</span></div><div class="line"> { 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL };</div><div class="line"><span class="preprocessor">#endif</span></div></div><!-- fragment --><p> or </p><div class="fragment"><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> invmul = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) (vector <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span>)</div><div class="line"><span class="preprocessor">#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__</span></div><div class="line"> { 0x1e626a6d, 0xd30baf9a, 0xb1137856, 0x39a5652f };</div><div class="line"><span class="preprocessor">#else</span></div><div class="line"> { 0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d };</div><div class="line"><span class="preprocessor">#endif</span></div></div><!-- fragment --><p> Remembering to add the endian correction for constants used quadword operations is an issue and manually reversing the element order can be error prone. There should be an easier way.</p>
<h2><a class="anchor" id="int128_const_0_0_2"></a>
Support for Quadword Integer Constants</h2>
<p>The <a class="el" href="vec__common__ppc_8h.html" title="Common definitions and typedef used by the collection of Power Vector Library (pveclib) headers...">vec_common_ppc.h</a> header provides some helper macros for when quadword operations need big endian element order on little endian platforms. These macros accept 2, 4, 8, or 16 element constants to form an aggregate initializer for a vector of the corresponding element type. The elements are always arranged left to right, high to low order. These macros are endian sensitive and either effectively pass-through for big endian or reverse the element order for little endian.</p>
<p>For example: </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);</div></div><!-- fragment --><p> or </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#ae4520a89b9b5a292a3e647a6d5b712ad">CONST_VINT128_W</a>(</div><div class="line"> 0x39a5652f, 0xb1137856, 0xd30baf9a, 0x1e626a6d);</div></div><!-- fragment --><p> These macros internally cast to a vector unsigned integer type for the aggregate initializer. This type corresponds to the size and number of elements to fit in a 128-bit vector. This tells the compiler how many elements to expect and the allowed value range for the initializer. A final explicit cast is required to the vector type needed (usually a signed or unsigned __int128). (See: <a class="el" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325" title="Initializer for 128-bits vector, as two unsigned long long elements in high->low order. May require an explicit cast. ">CONST_VINT128_DW()</a>, <a class="el" href="vec__common__ppc_8h.html#ae4520a89b9b5a292a3e647a6d5b712ad" title="Arrange word elements of a unsigned int initializer in high->low order. 
May require an explicit cast...">CONST_VINT128_W()</a>, <a class="el" href="vec__common__ppc_8h.html#a63ce8985f81cfb37c2bc3f9900dddd51" title="Arrange halfword elements of a unsigned int initializer in high->low order. May require an explicit c...">CONST_VINT128_H()</a>, <a class="el" href="vec__common__ppc_8h.html#a47803d5079cc714a1524f4cd2a05ef72" title="Arrange byte elements of a unsigned int initializer in high->low order. May require an explicit cast...">CONST_VINT128_B()</a> ). Other macros require the programmer to provide a cast to match the element count and size. (See: <a class="el" href="vec__common__ppc_8h.html#af4ffb9244d1aa4482b683d35c3544194" title="Arrange elements of dword initializer in high->low order. ">CONST_VINT64_DW()</a>, <a class="el" href="vec__common__ppc_8h.html#a7e03d3eaeafea2c6613233fd58f98ec1" title="Arrange elements of word initializer in high->low order. ">CONST_VINT32_W()</a>, <a class="el" href="vec__common__ppc_8h.html#a31b39d3ef7e2d5321449f6f2ab2a51b2" title="Arrange elements of halfword initializer in high->low order. ">CONST_VINT16_H()</a>, <a class="el" href="vec__common__ppc_8h.html#af5ff90cc45d590754b2403af4d014d7c" title="Arrange elements of byte initializer in high->low order. ">CONST_VINT8_B()</a> )</p>
<p>The methods above are effectively forming multi-digit constants where each digit is itself a large (word or doubleword) binary coded integer value. Because the digits are radix 2**N it is normal to convert large decimal constants to hexadecimal. This makes it easier to split the large constants into word or doubleword elements for the initializer.</p>
<p>Most compilers support compile time computation on constants. This is an optimization where only the final computed constant result is used in the generated code. Compile time constant computation supports the usual arithmetic operations on the usual types. Some compilers (including GCC) support constant computation on extended types including __int128.</p>
<p>For example: </p><div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten32_minus1 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) 9999999999999999UL) * 10000000000000000UL)</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) 9999999999999999UL);</div></div><!-- fragment --><p> produces the quadword integer value for the decimal constant 99999999999999999999999999999999.</p>
<dl class="section note"><dt>Note</dt><dd>we must cast any int or long long constants to [unsigned] __int128 so the compiler will use 128-bits arithmetic to compute the final constant.</dd></dl>
<p>With this technique we can split large decimal constants into 16, 18, or 19 digit blocks and then compute effective 32, 36, or 38 digit constant. (see <a class="el" href="vec__int128__ppc_8h.html#acd5c20e29b155f8f575d60f6af8f7955" title="Generate a vector unsigned __int128 constant from doublewords. ">CONST_VUINT128_Qx16d()</a>, <a class="el" href="vec__int128__ppc_8h.html#aa9c94b59ae2504f498923ed506a22083" title="Generate a vector unsigned __int128 constant from doublewords. ">CONST_VUINT128_Qx18d()</a>, and <a class="el" href="vec__int128__ppc_8h.html#a25faf0c51245eefdaeda1dc5dd71c516" title="Generate a vector unsigned __int128 constant from doublewords. ">CONST_VUINT128_Qx19d()</a>). For example: </p><div class="fragment"><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten32_minus1 = <a class="code" href="vec__int128__ppc_8h.html#acd5c20e29b155f8f575d60f6af8f7955">CONST_VUINT128_Qx16d</a></div><div class="line"> ( 9999999999999999UL, 9999999999999999UL );</div><div class="line"><span class="comment">// The quadword multiplicative inverse to divide by 10**16</span></div><div class="line"><span class="comment">// is 76624777043294442917917351357515459181.</span></div><div class="line"><span class="comment">// Which is 38 digits, so we split into 2 consts of 19 digits each.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = <a class="code" href="vec__int128__ppc_8h.html#a25faf0c51245eefdaeda1dc5dd71c516">CONST_VUINT128_Qx19d</a>(</div><div class="line"> 7662477704329444291UL, 7917351357515459181UL);</div></div><!-- fragment --><h1><a class="anchor" id="int128_arith_facts_0"></a>
Some facts about fixed precision integers</h1>
<p>The transition from grade school math to computer programming requires the realization that computers handle numbers in fixed sized chunks. For the PowerISA these chunks are byte, halfword, word, doubleword, and quadword. While computer languages like "C" have integer types like char, short, int, long int, and __int128.</p>
<p>Happily these chunks are large enough to hold the equivalent of several decimal digits and handle most of the grotty details of multiply, divide, add, and subtract. But sometimes the chunk (used) is not large enough to hold all the digits you need. Sums may overflow and multiplies may be truncated (modulo the chunk size).</p>
<p>Sometimes we can simply switch to the next larger size (int to long, word to doubleword) and avoid the problem (overflow of sums or truncation of multiply). But sometimes the largest chunk the compiler or hardware supports is still not large enough for the numbers we are dealing with. This requires <em>multiple precision arithmetic</em> which works a lot like grade school arithmetic but with larger digits represented by the most convenient computer sized chunk.</p>
<p>Most programmers would prefer to use an existing <em>multiple precision arithmetic</em> library and move on. Existing libraries are implemented with scalar instructions and loops over storage arrays. But here we need to provide vector quadword multiply and extended quadword add/subtract operations. Any transfers between the library's multi-precision storage arrays and vector registers are likely to exceed the timing for a direct vector implementation. </p><dl class="section note"><dt>Note</dt><dd>The PowerISA 2.07 provides direct vector quadword integer add/subtract with carry/extend. PowerISA 3.0 provides unsigned doubleword multiply with quadword product. This exceeds the capability of the PowerISA 64-bit (doubleword) Fixed Point unit which requires multiple instructions to generate quadword results.</dd></dl>
<p>We also want to provide the basis for general <em>multiple quadword precision arithmetic</em> operations (see <a class="el" href="vec__int512__ppc_8h.html" title="Header package containing a collection of multiple precision quadword integer computation functions i...">vec_int512_ppc.h</a>). And for security implementations requiring large multiply products we are motivated to leverage the PowerISA large vector register set to avoid exposing these results (and partial products) to memory/cache side channel attacks.</p>
<h2><a class="anchor" id="int128_arith_facts_0_1"></a>
Some useful arithmetic facts (you may have forgotten)</h2>
<p>First, multiplying an M-digit by an N-digit number requires up to (M+N)-digits to store the result. This is true independent of the size of your digit, including decimal, hexadecimal, and computer words/doublewords/quadwords. This explains why a 32-bit (word) by 32-bit integer multiply product is either:</p><ul>
<li>Truncated (modulo) to 32-bits, potentially losing the high order precision.</li>
<li>Expanded to the next larger (double) size (in this case 64-bit doubleword).</li>
</ul>
<p>The hardware has to do one or the other.</p>
<p>Let's look at some examples of multiplying two maximal 4-digit numbers: </p><div class="fragment"><div class="line">Decimal: 9999 x 9999 = 99980001</div><div class="line">Hexadecimal: FFFF x FFFF = FFFE0001</div></div><!-- fragment --><p> And to drive home the point, let's look at the case of multiplying two maximal (32-bit word) 4-digit numbers: </p><div class="fragment"><div class="line">quadword: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"> x FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"> = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE</div><div class="line"> 00000000 00000000 00000000 00000001</div></div><!-- fragment --><p> This is also a (128-bit quadword) digit multiply with a (256-bit) 2 quadword digit result.</p>
<p>Adding an asymmetric example; 4-digit by 1 digit multiply: </p><div class="fragment"><div class="line">Decimal: 9999 x 9 = 89991</div><div class="line">Hexadecimal: FFFF x F = EFFF1</div><div class="line">quadword: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"> x FFFFFFFF</div><div class="line"> = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF 00000001</div></div><!-- fragment --><p> This pattern repeats across all digit bases/sizes and values of M, N.</p>
<p>Note that the product is not the maximum value for the product width. It seems the product leaves <em>room</em> to add another digit or two without overflowing the double wide product. Let's try some 4 digit examples by adding a maximal 4 digit value to the product. </p><div class="fragment"><div class="line">Decimal: 9999 x 9999 = 99980001</div><div class="line"> + 9999</div><div class="line"> = 99990000</div><div class="line"></div><div class="line">Hexadecimal: FFFF x FFFF = FFFE0001</div><div class="line"> + FFFF</div><div class="line"> = FFFF0000</div></div><!-- fragment --><p> Looks like there is still room in the double wide product to add another maximal 4 digit value. </p><div class="fragment"><div class="line">Decimal: 9999 x 9999 = 99980001</div><div class="line"> + 9999</div><div class="line"> + 9999</div><div class="line"> = 99999999</div><div class="line"></div><div class="line">Hexadecimal: FFFF x FFFF = FFFE0001</div><div class="line"> + FFFF</div><div class="line"> + FFFF</div><div class="line"> = FFFFFFFF</div></div><!-- fragment --><p> But any more than that would cause an overflow.</p>
<p>Now we should look at addends to an asymmetric multiply. For example 4-digit by 1 digit multiply: </p><div class="fragment"><div class="line">Decimal: 9999 x 9 = 89991</div><div class="line"> + 9999</div><div class="line"> + 9</div><div class="line"> = 99999</div><div class="line">Hexadecimal: FFFF x F = EFFF1</div><div class="line"> + FFFF</div><div class="line"> + F</div><div class="line"> = FFFFF</div><div class="line">quadword: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"> x FFFFFFFF</div><div class="line"> = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF 00000001</div><div class="line"> + FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"> + FFFFFFFF</div><div class="line"> = FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div></div><!-- fragment --><p> Note that when M is not equal to N the addends are restricted to size M and/or size N. Two addends of the larger multiplier size can overflow. This pattern repeats across all digit bases/sizes and values of M, N. For the binary fixed-point multiply-add of bit sizes M/N we can write the equation:</p>
<p>(2<sup>(M+N)</sup> - 1) = ((2<sup>M</sup> - 1) * (2<sup>N</sup> - 1)) + (2<sup>M</sup> - 1) + (2<sup>N</sup> - 1)</p>
<p>Or in terms of fixed sized "words" of W-bits and M by N words.</p>
<p>(2<sup>(W*(M+N))</sup> - 1) = ((2<sup>(W*M)</sup> - 1) * (2<sup>(W*N)</sup> - 1)) + (2<sup>(W*M)</sup> - 1) + (2<sup>(W*N)</sup> - 1)</p>
<h2><a class="anchor" id="int128_arith_facts_0_2"></a>
Why does this matter?</h2>
<p>Because with modern hardware the actual multiply operations are faster and have less impact while the summation across the partial products becomes the major bottleneck. For recent POWER processors fixed-point multiplies are 5-7 cycles latency and dual issue (2/cycle). These multiplies are only dependent on the inputs (multiplicands). This allows the compiler and (super-scalar processor) to schedule the multiply operations early to prepare for summation. In many cases the 3rd and 4th multiplies are complete before the summation of the first two multiplies completes.</p>
<p>The add operations involved in partial product summation are dependent on the current column multiply and the high order word of summation of the previous stage. While add operations are nominally faster (2-3 cycles) than multiplies, they can generate carries that have to be propagated.</p>
<p>The Fixed-Point Unit has a dedicated <em>carry-bit (CA)</em> which becomes the critical resource. This dependency on the carry (in addition to the column multiply and previous summation) limits the compiler's (and hardware's) ability to parallelize stages of the summation. The Vector unit (PowerISA 2.07+) has quadword (vs Fixed point doubleword) binary add/subtract with carry/extend. The Vector Unit requires separate <em>write Carry</em> instructions to detect and return the carry to VRs. The <em>write Carry</em> instructions are paired with <em>Unsigned Quadword Modulo</em> instructions that generate the (modulo) 128-bit result.</p>
<dl class="section note"><dt>Note</dt><dd>PowerISA 3.0B has a new add extended (addex) instruction that can use the <em>overflow-bit (OF)</em> as a second carry (independent of CA). However the OF must be explicitly cleared (using subfo) before use as a carry flag. </dd>
<dd>
The Vector Unit has the effective use of up to 32 carry bits. The down-side is it requires an extra instruction and a whole 128-bit VR to generate and hold each carry bit.</dd></dl>
<p>So knowing how to avoid overflows and carries in the summation of partial products can be useful. To illustrate we can examine the POWER8 implementation of <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a>. POWER8 (PowerISA 2.07) does support add quadword but the largest vector fixed-point multiply is 32-bit Vector Multiply Even/Odd Unsigned Words (<a class="el" href="vec__int32__ppc_8h.html#ac93f07d5ad73243db2771da83b50d6d8" title="Vector multiply even unsigned words. ">vec_muleuw()</a> and <a class="el" href="vec__int32__ppc_8h.html#a3ca45c65b9627abfc493d4ad500a961d" title="Vector multiply odd unsigned words. ">vec_mulouw()</a>). The implementation generates four quadword by word (160-bit) partial products that are summed in four stages to generate the final 256-bit product.</p>
<p>Code for the first stage looks like this:</p>
<div class="fragment"><div class="line"><span class="comment">// Splat the lowest order word of b to tsw for word multiply</span></div><div class="line">tsw = vec_splat ((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) b, <a class="code" href="vec__common__ppc_8h.html#abf3504d2f86f03f90239a3196da3b3de">VEC_WE_3</a>);</div><div class="line"><span class="comment">// Multiply quadword a by lowest order word of b</span></div><div class="line">t_even = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int32__ppc_8h.html#ae30f226bd27241513f0611b50967a080">vec_vmuleuw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw);</div><div class="line">t_odd = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int32__ppc_8h.html#ae52349ced57857d20fb5e06b1b09cc05">vec_vmulouw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw);</div><div class="line"><span class="comment">// Rotate the low 32-bits (right) into tmq. 
This is actually</span></div><div class="line"><span class="comment">// implemented as 96-bit (12-byte) shift left.</span></div><div class="line">tmq = vec_sld (t_odd, z, 12);</div><div class="line"><span class="comment">// shift the low 128 bits of partial product right 32-bits</span></div><div class="line">t_odd = vec_sld (z, t_odd, 12);</div><div class="line"><span class="comment">// add the high 128 bits of even / odd partial products</span></div><div class="line">t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_even, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_odd);</div></div><!-- fragment --><p> Note in this case we can assume that the sum of aligned even/odd quadwords will not generate a carry. For example with maximum values for multiplicands a,b: </p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b x FFFFFFFF[3]</div><div class="line">t_even = FFFFFFFE 00000001 FFFFFFFE 00000001</div><div class="line">t_odd >> 32 + 00000000 FFFFFFFE 00000001 FFFFFFFE</div><div class="line">t = FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">tmq = 00000001</div></div><!-- fragment --><p> The high order 128-bits of the sum did not overflow.</p>
<p>The next tree stages are more complex. </p><div class="fragment"><div class="line"><span class="comment">// Splat the next word of b to tsw for word multiply</span></div><div class="line">tsw = vec_splat ((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) b, <a class="code" href="vec__common__ppc_8h.html#ad739666851dfec6cc520c2ee06fd5d41">VEC_WE_2</a>);</div><div class="line"><span class="comment">// Multiply quadword a by next word of b</span></div><div class="line">t_even = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int32__ppc_8h.html#ae30f226bd27241513f0611b50967a080">vec_vmuleuw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw);</div><div class="line">t_odd = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int32__ppc_8h.html#ae52349ced57857d20fb5e06b1b09cc05">vec_vmulouw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw);</div><div class="line"><span class="comment">// Add with carry the odd multiply with previous partial product</span></div><div class="line">tc = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_odd, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t);</div><div class="line">t_odd = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ((<a class="code" 
href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_odd, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t);</div><div class="line"><span class="comment">// Rotate the low 32-bits (right) into tmq.</span></div><div class="line">tmq = vec_sld (t_odd, tmq, 12);</div><div class="line"><span class="comment">// shift the low 128 bits (with carry) right 32-bits</span></div><div class="line">t_odd = vec_sld (tc, t_odd, 12);</div><div class="line"><span class="comment">// add the high 128 bits of even / odd partial products</span></div><div class="line">t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_even, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_odd);</div></div><!-- fragment --><p> Here we need a 3-way sum of the previous partial product, and the odd, even products from this stage. In this case the high 128-bits of previous partial product needs to align with the lower 128-bits of this stages 160-bit product for the first quadword add. This can produce a overflow, so we need to capture the carry and concatenate it the odd sum before shifting right 32-bits. Again we can assume that the sum of aligned even/odd quadwords will not generate a carry. 
For example stage 2 with maximum values for multiplicands a,b: </p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b x FFFFFFFF[2]</div><div class="line"></div><div class="line">t_odd FFFFFFFE 00000001 FFFFFFFE 00000001</div><div class="line">t + FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">t_odd = FFFFFFFD 00000001 FFFFFFFE 00000000</div><div class="line">tc = 00000000 00000000 00000000 00000001</div><div class="line"></div><div class="line">tc|t_odd>>32 = 00000001 FFFFFFFD 00000001 FFFFFFFE</div><div class="line">t_odd|tmq = 00000000 00000001</div><div class="line"></div><div class="line">t_even = FFFFFFFE 00000001 FFFFFFFE 00000001</div><div class="line">tc|t_odd>>32 + 00000001 FFFFFFFD 00000001 FFFFFFFE</div><div class="line">t = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF</div></div><!-- fragment --><p> For POWER8 this 3-way sum and the required write-carry adds significant latency to stages 2, 3, and 4 of this multiply.</p>
<p>In POWER8 the vector quadword add/subtract instructions are cracked into 2 dependent simple fixed-point (XS) IOPs. So the effective instruction latency is (2+2=4) cycles. Also cracked instructions must be <em>first in group</em>, so back-to-back vaddcuq/vadduqm sequences will be dispatched separately. There is no possibility of executing the pair concurrently, so the latency for the pair is 5-6 cycles.</p>
<p>So there is value in finding an alternative summation that avoids/reduces the number of write-carry operations. From above (<a class="el" href="vec__int128__ppc_8h.html#int128_arith_facts_0_1">Some useful arithmetic facts (you may of forgotten)</a>) we know it is possible to add one or two unsigned words to each of the doubleword products generated by vmuleuw/vmulouw.</p>
<p>We need to align the words of the quadword addend (zero extended on the left to doublewords) with the corresponding doublewords of the products. We can use Vector Merge Even/Odd Word operations to split and pad the addend to align with the products. Then we use Vector Add Doubleword for the even/odd product-sums. Finally, we use shift and add quadword to produce the 160-bit stage 2 sum. </p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b x FFFFFFFF[2]</div><div class="line">quadword t: FFFFFFFE FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line"></div><div class="line">t_even = FFFFFFFE 00000001 FFFFFFFE 00000001</div><div class="line">mrgew(z,t) + 00000000 FFFFFFFE 00000000 FFFFFFFF</div><div class="line"> = FFFFFFFE FFFFFFFF FFFFFFFF 00000000</div><div class="line"></div><div class="line">t_odd = FFFFFFFE 00000001 FFFFFFFE 00000001</div><div class="line">mrgow(z,t) + 00000000 FFFFFFFF 00000000 FFFFFFFF</div><div class="line"> = FFFFFFFF 00000000 FFFFFFFF 00000000</div><div class="line"></div><div class="line">t_odd>>32 = 00000000 FFFFFFFF 00000000 FFFFFFFF</div><div class="line">t_odd|tmq>>32= 00000000 00000001</div><div class="line"></div><div class="line">t_even = FFFFFFFE FFFFFFFF FFFFFFFF 00000000</div><div class="line">t_odd>>32 + 00000000 FFFFFFFF 00000000 FFFFFFFF</div><div class="line">t = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF</div><div class="line">t_odd|tmq = 00000000 00000001</div></div><!-- fragment --><p> This sequence replaces two instructions (vaddcuq/vadduqm) with four instructions (vmrgew/vmrgow/vaddudm/vaddudm), all of which:</p><ul>
<li>have 2 cycle latency</li>
<li>are dual issue</li>
<li>without dispatch restrictions</li>
</ul>
<p>We expect a latency of 4 cycles over the whole sequence. And splitting the first add into even/odd add blocks allows the compiler (and out-of-order hardware) more flexibility for instruction scheduling.</p>
<h3><a class="anchor" id="int128_arith_facts_0_2_1"></a>
Vector Multiply-Add</h3>
<p>Multiply-add seems to be a useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation that we can use here. For example: </p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a></div><div class="line"><a class="code" href="vec__int64__ppc_8h.html#a1e20bdd1df7e3e49dca06d5512ada84b">vec_vmaddeuw</a> (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> b, <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> c)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> zero = { 0, 0, 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> res;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> c_euw = <a class="code" href="vec__int32__ppc_8h.html#a0d39dc4278a5e0711e9109746b23f2c7">vec_mrgahw</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) c);</div><div class="line"> res = <a class="code" href="vec__int32__ppc_8h.html#ae30f226bd27241513f0611b50967a080">vec_vmuleuw</a> (a, b);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int64__ppc_8h.html#a28052c1907d1f733c9dda8a48039e546">vec_addudm</a> (res, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) c_euw);</div><div 
class="line">}</div></div><!-- fragment --><p> Which generates the following instruction sequence: </p><div class="fragment"><div class="line"><__vec_vmaddeuw_PWR8>:</div><div class="line"> d70: vmuleuw v2,v2,v3</div><div class="line"> d74: vspltisw v0,0</div><div class="line"> d78: vmrgew v4,v0,v4</div><div class="line"> d7c: vaddudm v2,v2,v4</div></div><!-- fragment --><p> The vspltisw loads (immediate) the zero vector and the compiler should <em>common</em> this across operations and schedule this instruction once, early in the function. The vmrgew has a latency of 2 cycles and should execute concurrently with vmuleuw. Similarly for <a class="el" href="vec__int64__ppc_8h.html#a32acead723b7867ff4c9f8be9bb708ca" title="Vector Multiply-Add Odd Unsigned Words. ">vec_vmaddouw()</a>.</p>
<p>These operations (<a class="el" href="vec__int64__ppc_8h.html#a1e20bdd1df7e3e49dca06d5512ada84b" title="Vector Multiply-Add Even Unsigned Words. ">vec_vmaddeuw()</a> and <a class="el" href="vec__int64__ppc_8h.html#a32acead723b7867ff4c9f8be9bb708ca" title="Vector Multiply-Add Odd Unsigned Words. ">vec_vmaddouw()</a>) are included in <a class="el" href="vec__int64__ppc_8h.html" title="Header package containing a collection of 128-bit SIMD operations over 64-bit integer elements...">vec_int64_ppc.h</a> as they require <a class="el" href="vec__int64__ppc_8h.html#a28052c1907d1f733c9dda8a48039e546" title="Vector Add Unsigned Doubleword Modulo. ">vec_addudm()</a> and produce doubleword results. With this addition we can improve and simplify the code for stages 2-4 of the _ARCH_PWR8 implementation of <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a>. For example: </p><div class="fragment"><div class="line"><span class="comment">// Splat the next word of b to tsw for word multiply</span></div><div class="line">tsw = vec_splat ((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) b, <a class="code" href="vec__common__ppc_8h.html#ad739666851dfec6cc520c2ee06fd5d41">VEC_WE_2</a>);</div><div class="line"><span class="comment">// Multiply quadword a by next word of b and add previous partial</span></div><div class="line"><span class="comment">// product using multiply-add even/odd</span></div><div class="line">t_even = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int64__ppc_8h.html#a1e20bdd1df7e3e49dca06d5512ada84b">vec_vmaddeuw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw, t);</div><div class="line">t_odd = (<a class="code" 
href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)<a class="code" href="vec__int64__ppc_8h.html#a32acead723b7867ff4c9f8be9bb708ca">vec_vmaddouw</a>((<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>)a, tsw, t);</div><div class="line"><span class="comment">// Rotate the low 32-bits (right) into tmq.</span></div><div class="line">tmq = vec_sld (t_odd, tmq, 12);</div><div class="line"><span class="comment">// shift the low 128 bits (with carry) right 32-bits</span></div><div class="line">t_odd = vec_sld (z, t_odd, 12);</div><div class="line"><span class="comment">// add the high 128 bits of even / odd partial products</span></div><div class="line">t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_even, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t_odd);</div></div><!-- fragment --><h3><a class="anchor" id="int128_arith_facts_0_2_2"></a>
And Vector Multiply-Add2</h3>
<p>From the description above (<a class="el" href="vec__int128__ppc_8h.html#int128_arith_facts_0_1">Some useful arithmetic facts (you may of forgotten)</a>) we know we can add two unsigned words to the doubleword product without overflow. This is another useful operation that does not exist in the current PowerISA. But it is simple enough to create an in-line PVECLIB operation. For example: </p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a></div><div class="line"><a class="code" href="vec__int64__ppc_8h.html#a1b046a56d566ec2ea351042fd9dd11de">vec_vmadd2euw</a> (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> b, <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> c, <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> d)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> zero = { 0, 0, 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> res, sum;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> c_euw = <a class="code" href="vec__int32__ppc_8h.html#a0d39dc4278a5e0711e9109746b23f2c7">vec_mrgahw</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) c);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> d_euw = <a 
class="code" href="vec__int32__ppc_8h.html#a0d39dc4278a5e0711e9109746b23f2c7">vec_mrgahw</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) d);</div><div class="line"> res = <a class="code" href="vec__int32__ppc_8h.html#ae30f226bd27241513f0611b50967a080">vec_vmuleuw</a> (a, b);</div><div class="line"> sum = <a class="code" href="vec__int64__ppc_8h.html#a28052c1907d1f733c9dda8a48039e546">vec_addudm</a> ( (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) c_euw, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) d_euw);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int64__ppc_8h.html#a28052c1907d1f733c9dda8a48039e546">vec_addudm</a> (res, sum);</div><div class="line">}</div></div><!-- fragment --><p> Which generates to following instruction sequence: </p><div class="fragment"><div class="line"><__vec_vmadd2euw_PWR8>:</div><div class="line"> db0: vmuleuw v2,v2,v3</div><div class="line"> db4: vspltisw v0,0</div><div class="line"> db8: vmrgew v4,v0,v4</div><div class="line"> dbc: vmrgew v5,v0,v5</div><div class="line"> dc0: vaddudm v5,v4,v5</div><div class="line"> dc4: vaddudm v2,v2,v5</div></div><!-- fragment --><p> The vspltisw loads (immediate) the zero vector and the compiler should <em>common</em> this across operations and schedule this instruction once, early in the function. The vmrgew/vmrgew/vaddudm sequence has a latency of 4-6 cycles and should execute concurrently with vmuleuw. Similarly for <a class="el" href="vec__int64__ppc_8h.html#a40ab00ed413c1aa1a8148cd9981235bf" title="Vector Multiply-Add2 Odd Unsigned Words. ">vec_vmadd2ouw()</a>.</p>
<h3><a class="anchor" id="int128_arith_facts_0_2_3"></a>
Why not Vector Multiply-Sum</h3>
<p>The PowerISA has a number of Multiply-Sum instructions that look a lot like the Multiply-Add described above. So why not use them? Well, they are not exactly equivalent:</p><ul>
<li>The behavior of Multiply-Sum allows overflow without any architected way to detect/capture and propagate the carry.<ul>
<li>For each "word" element of VRA and VRB, the two (even/odd) halves are multiplied separately: first the even halves of the "word" element, then the odd halves. This generates two unsigned integer "word" products for each "word" element.</li>
<li>The sum of these two integer "word" products is added to the corresponding integer "word" element in VRC.</li>
<li>This 3-way sum can overflow without notification.</li>
</ul>
</li>
<li>Multiply-Sum instructions can be used to emulate Multiply Even/Odd and Multiply-Add Even/Odd by constraining the inputs.<ul>
<li>Using Multiply-Sum to add prior partial-sums creates a serial dependency that limits instruction scheduling and slows execution.</li>
</ul>
</li>
<li>The PowerISA does not have Multiply-Sum Word instructions.</li>
<li>The PowerISA 3.0 has a Multiply-Sum Unsigned Doubleword instruction but it does not exist in POWER8.</li>
<li>The base Altivec has Multiply-Sum Halfword/Byte instructions. But using POWER8's Multiply Even/Odd Unsigned Word is better for implementing quadword multiply on POWER8.</li>
</ul>
<p>First we should look at the arithmetic of Multiply-Sum using maximal unsigned integer values. </p><div class="fragment"><div class="line">VRA: FFFF x FFFF</div><div class="line">VRB: FFFF x FFFF</div><div class="line">VRC: FFFF FFFF</div><div class="line"></div><div class="line">Even half: FFFF x FFFF -> FFFE0001</div><div class="line">odd half: FFFF x FFFF -> + FFFE0001</div><div class="line">Word addend -> + FFFFFFFF</div><div class="line"> = 2 FFFC0001</div></div><!-- fragment --><p> Note the sum overflows the word twice and high order bits of the sum will be lost.</p>
<p>For POWER9 we can simulate Vector Multiply Even/Odd Unsigned Doubleword by setting the Odd/Even doubleword of VRB to zero and the whole quadword addend VRC to zero. For example the even doubleword multiply. </p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b_eud = <a class="code" href="vec__int64__ppc_8h.html#acd5bafec6c1c15b0336551e82d1169d4">vec_mrgahd</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) b, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed">vec_msumudm</a>(a, b_eud, zero);</div><div class="line">}</div></div><!-- fragment --><p> And similarly for the odd doubleword multiply. 
</p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b_oud = <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) b);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed">vec_msumudm</a>(a, b_oud, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero);</div><div class="line">}</div></div><!-- fragment --><p> And review the arithmetic for <a class="el" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151" title="Vector Multiply Even Unsigned Doublewords. ">vec_vmuleud()</a> using maximal quadword values for a and b. 
</p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword z: 00000000 00000000 00000000 00000000</div><div class="line"></div><div class="line">mrged(b,z) = FFFFFFFF FFFFFFFF 00000000 00000000</div><div class="line"></div><div class="line">Even prod: FFFFFFFF FFFFFFFE 00000000 00000001</div><div class="line">odd prod + 00000000 00000000 00000000 00000000</div><div class="line">Word addend + 00000000 00000000 00000000 00000000</div><div class="line">msumudm = FFFFFFFF FFFFFFFE 00000000 00000001</div></div><!-- fragment --><p> And for vec_vmuldud(). </p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword z: 00000000 00000000 00000000 00000000</div><div class="line"></div><div class="line">mrgod(z,b) = 00000000 00000000 FFFFFFFF FFFFFFFF</div><div class="line"></div><div class="line">Even prod: 00000000 00000000 00000000 00000000</div><div class="line">odd prod + FFFFFFFF FFFFFFFE 00000000 00000001</div><div class="line">Word addend + 00000000 00000000 00000000 00000000</div><div class="line">msumudm = FFFFFFFF FFFFFFFE 00000000 00000001</div></div><!-- fragment --><p>We can also simulate Vector Multiply-Add Even/Odd Unsigned Doubleword by setting the odd/even doubleword of VRB to zero and the whole quadword addend to the even/odd double word of VRC. For example the even doubleword multiply-add. 
</p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#a6280736f91cb67eca10b55e750bfe1de">vec_vmaddeud</a> (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b_eud = <a class="code" href="vec__int64__ppc_8h.html#acd5bafec6c1c15b0336551e82d1169d4">vec_mrgahd</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) b, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c_eud = <a class="code" href="vec__int64__ppc_8h.html#acd5bafec6c1c15b0336551e82d1169d4">vec_mrgahd</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) c);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed">vec_msumudm</a>(a, b_eud, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) c_eud);</div><div class="line">}</div></div><!-- 
fragment --><p> And similarly for the odd doubleword multiply-add. </p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#ae0b83d2696455fea53b1ecf434a0daf8">vec_vmaddoud</a> (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b, <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> b_oud = <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) b);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> c_oud = <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) c);</div><div class="line"> <span class="keywordflow">return</span> <a class="code" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed">vec_msumudm</a>(a, b_oud, (<a class="code" 
href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) c_oud);</div><div class="line">}</div></div><!-- fragment --><p> And review the arithmetic for <a class="el" href="vec__int128__ppc_8h.html#a6280736f91cb67eca10b55e750bfe1de" title="Vector Multiply-Add Even Unsigned Doublewords. ">vec_vmaddeud()</a> using maximal quadword values for a and b. The even/odd doublewords of c have slightly different values for illustrative purposes. </p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword c: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE</div><div class="line"></div><div class="line">mrged(b,z) = FFFFFFFF FFFFFFFF 00000000 00000000</div><div class="line">mrged(z,c) = 00000000 00000000 FFFFFFFF FFFFFFFF</div><div class="line"></div><div class="line">Even prod: FFFFFFFF FFFFFFFE 00000000 00000001</div><div class="line">odd prod + 00000000 00000000 00000000 00000000</div><div class="line">Word addend + 00000000 00000000 FFFFFFFF FFFFFFFF</div><div class="line">msumudm = FFFFFFFF FFFFFFFF 00000000 00000000</div></div><!-- fragment --><p> And for <a class="el" href="vec__int128__ppc_8h.html#ae0b83d2696455fea53b1ecf434a0daf8" title="Vector Multiply-Add Odd Unsigned Doublewords. ">vec_vmaddoud()</a>. 
</p><div class="fragment"><div class="line">quadword a: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword b: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF</div><div class="line">quadword c: FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE</div><div class="line"></div><div class="line">mrgod(z,b) = 00000000 00000000 FFFFFFFF FFFFFFFF</div><div class="line">mrgod(z,c) = 00000000 00000000 FFFFFFFF FFFFFFFE</div><div class="line"></div><div class="line">Even prod: 00000000 00000000 00000000 00000000</div><div class="line">odd prod + FFFFFFFF FFFFFFFE 00000000 00000001</div><div class="line">Word addend + 00000000 00000000 FFFFFFFF FFFFFFFE</div><div class="line">msumudm = FFFFFFFF FFFFFFFE FFFFFFFF FFFFFFFF</div></div><!-- fragment --><p> This multiply-add even/odd doulbeword form only adds one additional (xxmrghd AKA xxpermdi) instruction over that required for the base multiply even/odd doubleword operation. </p><div class="fragment"><div class="line"><__vmuleud_PWR9>:</div><div class="line"> 120: xxspltib v0,0</div><div class="line"> 124: xxmrghd v3,v3,v0</div><div class="line"> 128: vmsumudm v2,v2,v3,v0</div><div class="line"></div><div class="line"><__vmaddeud_PWR9>:</div><div class="line"> 1a0: xxspltib v0,0</div><div class="line"> 1a4: xxmrghd v3,v3,v0</div><div class="line"> 1a8: xxmrghd v4,v0,v4</div><div class="line"> 1ac: vmsumudm v2,v2,v3,v4</div></div><!-- fragment --><p> The xxspltib loads (immediate) the zero vector and the compiler should <em>common</em> this across operations and schedule this instruction once, early in the function.</p>
<p>For POWER9, instruction timing is different and there are some unique trade-offs. The implementations above are small and appropriate for single instances of multiply doubleword or implementations of multiply quadword. However using the vmsumudm (operand VRC) addend creates a serial dependency within the multiply quadword implementation. When multiply quadword and multiply-add quadword are used in the implementation of wider multiplies (see <a class="el" href="vec__int512__ppc_8h.html" title="Header package containing a collection of multiple precision quadword integer computation functions i...">vec_int512_ppc.h</a>) these serial dependencies actually slow down the implementation.</p>
<ul>
<li>A full 128 x 128-bit multiply only requires two stages of even/odd doubleword multiplies. This allows some simplification.<ul>
<li>Alignment shifts can be replaced with permute doubleword immediate (xxmrgld/xxmrghd/xxpermdi) operations.</li>
<li>Careful rearrangement of the operations and operands allow the compiler to optimize (as common subexpressions) some of the doubleword masking operations.</li>
</ul>
</li>
<li>The multiply even/odd doubleword operations require explicit masking of the even/odd multiplicands.<ul>
<li>Doubleword masking can be done with xxmrgld/xxmrghd/xxpermdi instructions which are dual issue with a 3 cycle latency.</li>
<li>The multiplies (vmsumudm) are serially dependent on these masking instructions.</li>
<li>In the POWER8 implementation (using vmuleuw/vmulouw) the multiplicand masking is implicit to the instruction.</li>
</ul>
</li>
<li>The vmsumudm with the VRC addend can be used to combine the multiply-add of the partial product from the previous stage.<ul>
<li>This also requires explicit doubleword masking to avoid overflowing the quadword sum.</li>
<li>This can make the masking operation and the multiply itself, serially dependent on the partial product sum from the previous stage.</li>
</ul>
</li>
<li>The add (modulo/write-carry/extend) quadword instructions are dual issue with a 3 cycle latency. So the cost of quadword sums and generating/propagating carries is of less concern (than on POWER8).<ul>
<li>It can be better to use explicit add quadword and avoid the serial dependency on the vmsumudm (VRC) addend.</li>
<li>This allows the compiler (and out-of-order hardware) more flexibility for instruction scheduling.</li>
</ul>
</li>
</ul>
<p>So let's look at some examples using the vmsumudm (VRC) addend and the alternative using VRC (setting VRA to zero) and explicit add quadword. First a 128x128-bit unsigned multiply using vmsumudm and exploiting the VRC addend where appropriate. </p><div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line">__test_muludq_y_PWR9 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</div><div class="line">{</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a> t, tmq;</div><div class="line"> <span class="comment">// compute the 256 bit product of two 128 bit values a, b.</span></div><div class="line"> <span class="comment">// The high 128 bits are accumulated in t and the low 128-bits</span></div><div class="line"> <span class="comment">// in tmq. The high 128-bits of the product are returned to the</span></div><div class="line"> <span class="comment">// address of the 1st parm. 
The low 128-bits are the return</span></div><div class="line"> <span class="comment">// value.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a_swap = <a class="code" href="vec__int64__ppc_8h.html#a42d2b39711c06106097ef869a20420b6">vec_swapd</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) a);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> tmh, tab, tba, tb0, tc1, tc2;</div><div class="line"> <span class="comment">// multiply the low 64-bits of a and b. For PWR9 this is just</span></div><div class="line"> <span class="comment">// vmsumudm with conditioned inputs.</span></div><div class="line"> tmq = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> <span class="comment">// compute the 2 middle partial projects. 
Use vmaddeud to add the</span></div><div class="line"> <span class="comment">// high 64-bits of the low product to one of the middle products.</span></div><div class="line"> <span class="comment">// This can not overflow.</span></div><div class="line"> tab = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) b);</div><div class="line"> tba = <a class="code" href="vec__int128__ppc_8h.html#a6280736f91cb67eca10b55e750bfe1de">vec_vmaddeud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) b, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tmq);</div><div class="line"> <span class="comment">// sum the two middle products (plus the high 64-bits of the low</span></div><div class="line"> <span class="comment">// product. This will generate a carry that we need to capture.</span></div><div class="line"> t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (tab, tba);</div><div class="line"> tc1 = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (tab, tba);</div><div class="line"> <span class="comment">// result = t[l] || tmq[l].</span></div><div class="line"> tmq = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tmq);</div><div class="line"> <span class="comment">// we can use multiply sum here 
because the high product plus the</span></div><div class="line"> <span class="comment">// high sum of middle partial products can't overflow.</span></div><div class="line"> t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a8238ba590103ac80fb146a6a2b1aed1c">vec_permdi</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tc1, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) t, 2);</div><div class="line"> <span class="comment">// This is equivalent to vec_vmadd2eud(a, b, tab, tba)</span></div><div class="line"> <span class="comment">// were (tab_even + tba_even) was pre-computed including the carry,</span></div><div class="line"> <span class="comment">// so no masking is required.</span></div><div class="line"> t = (<a class="code" href="vec__common__ppc_8h.html#a2ff4a776536870e01b7c9e454586544b">vui32_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a43f6f199cdf39641d940f5b8d55dbf6b">vec_vmsumeud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) b, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t);</div><div class="line"></div><div class="line"> *mulu = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) t;</div><div class="line"> <span class="keywordflow">return</span> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tmq);</div><div class="line">}</div></div><!-- fragment --><div class="fragment"><div class="line"><__test_muludq_y_PWR9>:</div><div class="line"> 370: xxspltib v1,0</div><div class="line"> 374: xxswapd v12,v2</div><div class="line"> 378: xxlor v13,v2,v2</div><div 
class="line"> 37c: xxmrgld v0,v1,v3</div><div class="line"> 380: xxmrghd v3,v3,v1</div><div class="line"> 384: vmsumudm v2,v2,v0,v1</div><div class="line"> 388: vmsumudm v0,v12,v0,v1</div><div class="line"> 38c: xxmrghd v1,v1,v2</div><div class="line"> 390: vmsumudm v1,v12,v3,v1</div><div class="line"> 394: vadduqm v12,v1,v0</div><div class="line"> 398: vaddcuq v0,v0,v1</div><div class="line"> 39c: xxmrgld v2,v12,v2</div><div class="line"> 3a0: xxpermdi v0,v0,v12,2</div><div class="line"> 3a4: vmsumudm v13,v13,v3,v0</div><div class="line"> 3a8: stxv v13,0(r3)</div><div class="line"> 3ac: blr</div></div><!-- fragment --> <dl class="section note"><dt>Note</dt><dd>that the first vmsumudm instruction is only dependent on the parameters a, masked b_odd, and const zero. The second vmsumudm instruction is only dependent on the parameters a_swap, masked b_odd, and const zero. The swap/mask operations require 3-4 cycles and 7 cycles to complete the first two vmsumudm's. The third vmsumudm instruction is dependent on the parameters a_swap, masked b_even, and masked tmq_even. The masked tmq_even is dependent on the xxmrghd of the results of the first vmsumudm. This adds another 10 cycles. The fourth and final vmsumudm instruction is dependent on the parameters a, masked b_even, and the shifted sum (with carry) of (tab + tba). This is in turn dependent on the results from the second and third vmsumudm instructions. This adds another (6+7= 13) cycles for a total of 34 cycles. When this operation is expanded in-line the stxv and xxspltib will be optimized and can be ignored for this analysis.</dd></dl>
<p>Next a 128x128-bit unsigned multiply using vmsumudm but only passing const zero to the VRC addend. </p><div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line">__test_muludq_x_PWR9 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b)</div><div class="line">{</div><div class="line"> <span class="comment">// compute the 256 bit product of two 128 bit values a, b.</span></div><div class="line"> <span class="comment">// The high 128 bits are accumulated in t and the low 128-bits</span></div><div class="line"> <span class="comment">// in tmq. The high 128-bits of the product are returned to the</span></div><div class="line"> <span class="comment">// address of the 1st parm. 
The low 128-bits are the return</span></div><div class="line"> <span class="comment">// value.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> zero = { 0, 0 };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a_swap = <a class="code" href="vec__int64__ppc_8h.html#a42d2b39711c06106097ef869a20420b6">vec_swapd</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) a);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> thq, tlq, tx;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> t0l, tc1;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> thh, thl, tlh, tll;</div><div class="line"> <span class="comment">// multiply the low 64-bits of a and b. 
For PWR9 this is just</span></div><div class="line"> <span class="comment">// vmsumudm with conditioned inputs.</span></div><div class="line"> tll = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> thh = <a class="code" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> thl = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> tlh = <a class="code" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> <span class="comment">// sum the two middle products (plus the high 64-bits of the low</span></div><div class="line"> <span class="comment">// product. 
This will generate a carry that we need to capture.</span></div><div class="line"> t0l = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#acd5bafec6c1c15b0336551e82d1169d4">vec_mrgahd</a> ( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero, tll);</div><div class="line"> tc1 = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (thl, tlh);</div><div class="line"> tx = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (thl, tlh);</div><div class="line"> tx = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (tx, t0l);</div><div class="line"> <span class="comment">// result = t[l] || tll[l].</span></div><div class="line"> tlq = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tx, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tll);</div><div class="line"> <span class="comment">// Sum the high product plus the high sum (with carry) of middle</span></div><div class="line"> <span class="comment">// partial products. 
This can't overflow.</span></div><div class="line"> thq = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a8238ba590103ac80fb146a6a2b1aed1c">vec_permdi</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tc1, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tx, 2);</div><div class="line"> thq = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ( thh, thq);</div><div class="line"></div><div class="line"> *mulu = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) thq;</div><div class="line"> <span class="keywordflow">return</span> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tlq);</div><div class="line">}</div></div><!-- fragment --><div class="fragment"><div class="line"><__test_muludq_x_PWR9>:</div><div class="line"> 320: xxspltib v0,0</div><div class="line"> 324: xxswapd v12,v2</div><div class="line"> 328: xxmrgld v13,v0,v3</div><div class="line"> 32c: xxmrghd v3,v3,v0</div><div class="line"> 330: vmsumudm v1,v12,v13,v0</div><div class="line"> 334: vmsumudm v13,v2,v13,v0</div><div class="line"> 338: vmsumudm v12,v12,v3,v0</div><div class="line"> 33c: xxmrghd v10,v0,v13</div><div class="line"> 340: vadduqm v11,v12,v1</div><div class="line"> 344: vmsumudm v3,v2,v3,v0</div><div class="line"> 348: vaddcuq v1,v1,v12</div><div class="line"> 34c: vadduqm v2,v11,v10</div><div class="line"> 350: xxpermdi v1,v1,v2,2</div><div class="line"> 354: xxmrgld v2,v2,v13</div><div class="line"> 358: vadduqm v3,v3,v1</div><div class="line"> 35c: stxv v3,0(r3)</div><div class="line"> 360: blr</div></div><!-- fragment --> <dl class="section note"><dt>Note</dt><dd>that the vmsumudm instructions only depend on the parameters a/a_swap, masked b_odd/b_even, 
and const zero. After the parameters are conditioned (swapped/masked) the independent vmsumudm's can be scheduled early. The swap/mask operations require 3-4 cycles and 8 cycles to complete four independent vmsumudm's. The partial product alignment and sums require another 12 cycles, for a total of 24 cycles. When this operation is expanded in-line the stxv and xxspltib will be optimized and can be ignored for this analysis.</dd></dl>
<p>The second example (using explicit add quadword);</p><ul>
<li>only adds 1 instruction over the first example,</li>
<li>and executes 10 cycles faster.</li>
</ul>
<h3><a class="anchor" id="int128_arith_facts_0_2_4"></a>
Vector Multiply-Add Quadword</h3>
<p>We can use multiply-add operation for wider word sizes (quadword and multiple precision quadword). The simplest quadword implementation would create a <a class="el" href="vec__int128__ppc_8h.html#a2a38409db81a0765586023d5874ab6b0" title="Vector Multiply-Add Unsigned Quadword. ">vec_madduq()</a> operation based on <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a> and add a quadword parameter "c" for the addend. Then modify the first stage of the platform specific multiplies to replace vector multiply even/odd with vector multiply-add even/odd, passing the addend as the third parameter.</p>
<p>This works well for the POWER8 implementation because the additional vector add doublewords can be scheduled independently of the vector multiply even/odd words. But for POWER9 we need to avoid the serial dependences explained above in <a class="el" href="vec__int128__ppc_8h.html#int128_arith_facts_0_2_3">Why not Vector Multiply-Sum</a>.</p>
<p>For the POWER9 implementation we use an explicit add quadword (and write-Carry) to sum the addend parameter to the first stage Multiply odd doubleword. For example: </p><div class="fragment"><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line">__test_madduq_y_PWR9 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *mulu, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> a, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> b, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> c)</div><div class="line">{</div><div class="line"> <span class="comment">// compute the 256 bit sum of product of two 128 bit values a, b</span></div><div class="line"> <span class="comment">// plus the quadword addend c.</span></div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> a_swap = <a class="code" href="vec__int64__ppc_8h.html#a42d2b39711c06106097ef869a20420b6">vec_swapd</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) a);</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> thq, tlq, tx;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> t0l, tc1, tcl;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> thh, thl, tlh, tll;</div><div class="line"> <span class="comment">// multiply the four combinations of a_odd/a_even by b_odd/b_even.</span></div><div class="line"> tll = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> ((<a class="code" 
href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> thh = <a class="code" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)a, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> thl = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> tlh = <a class="code" href="vec__int128__ppc_8h.html#a84e6361054b52ac4564bcef25b718151">vec_vmuleud</a> (a_swap, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>)b);</div><div class="line"> <span class="comment">// Add c to lower 128-bits of the partial product.</span></div><div class="line"> tcl = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (tll, c);</div><div class="line"> tll = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (tll, c);</div><div class="line"> t0l = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a8238ba590103ac80fb146a6a2b1aed1c">vec_permdi</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tcl, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tll, 2);</div><div class="line"> <span class="comment">// sum the two middle products (plus the high 65-bits of the low</span></div><div class="line"> <span class="comment">// product-sum).</span></div><div class="line"> 
tc1 = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (thl, tlh);</div><div class="line"> tx = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (thl, tlh);</div><div class="line"> tx = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (tx, t0l);</div><div class="line"> <span class="comment">// result = tx[l]_odd || tll[l]_odd.</span></div><div class="line"> tlq = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a5242d6311cd5ab50377cfeb2cf2ac8bf">vec_mrgald</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tx, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tll);</div><div class="line"> <span class="comment">// Sum the high product plus the high sum (with carry) of middle</span></div><div class="line"> <span class="comment">// partial products. 
This can't overflow.</span></div><div class="line"> thq = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__int64__ppc_8h.html#a8238ba590103ac80fb146a6a2b1aed1c">vec_permdi</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tc1, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tx, 2);</div><div class="line"> thq = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> ( thh, thq);</div><div class="line"></div><div class="line"> *mulu = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) thq;</div><div class="line"> <span class="keywordflow">return</span> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) tlq);</div><div class="line">}</div></div><!-- fragment --><p> The generated code is the same size as the serially dependent version.</p>
<p>This is just another example where the shortest instruction sequence or using the most powerful instructions, may not be the fastest implementation. The key point is that avoiding serial dependencies in the code and allowing the compiler to schedule high latency instructions early, allows better performance. This effect is amplified when quadword multiplies (<a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a>, <a class="el" href="vec__int128__ppc_8h.html#a2a38409db81a0765586023d5874ab6b0" title="Vector Multiply-Add Unsigned Quadword. ">vec_madduq()</a>, and <a class="el" href="vec__int128__ppc_8h.html#a7e7f25c382f8016baa6452fc02c03b83" title="Vector Multiply-Add2 Unsigned Quadword. ">vec_madd2uq()</a>) are used to compose wider multiply operations (see <a class="el" href="vec__int512__ppc_8h.html" title="Header package containing a collection of multiple precision quadword integer computation functions i...">vec_int512_ppc.h</a>).</p>
<h1><a class="anchor" id="int128_examples_0_1"></a>
Vector Quadword Examples</h1>
<p>The PowerISA Vector facilities provide logical and integer arithmetic quadword (128-bit) operations. Some operations are direct PowerISA instructions and other operations are composed of short instruction sequences. The Power Vector Library provides a higher level and comprehensive API of quadword integer arithmetic and support for extended arithmetic to multiple quadwords.</p>
<h2><a class="anchor" id="int128_examples_0_1_1"></a>
Printing Vector __int128 values</h2>
<p>The GCC compiler supports the (vector) __int128 type but the runtime does not support <b>printf()</b> formatting for __int128 types. However if we can use divide/modulo operations to split vector __int128 values into modulo 10^16 long int (doubleword) chunks, we can use printf() to convert and concatenate the decimal values into a complete number.</p>
<p>For example, from the __int128 value (39 decimal digits):</p><ul>
<li>Detect the sign and set a char to '+' or '-'</li>
<li>Then from the absolute value, divide/modulo by 10000000000000000. Producing:<ul>
<li>The highest 7 digits (t_high)</li>
<li>The middle 16 digits (t_mid)</li>
<li>The lowest 16 digits (t_low)</li>
</ul>
</li>
</ul>
<p>We can use signed compare to detect the sign and set a char value to print a ' ' or '-' prefix. If the value is negative we want the absolute value before we do the divide/modulo steps. For example: </p><div class="fragment"><div class="line"><span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a269401b65405524bb2d971bef595cb0d">vec_cmpsq_all_ge</a> (value, zero128))</div><div class="line"> {</div><div class="line"> sign = <span class="charliteral">' '</span>;</div><div class="line"> val128 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) value;</div><div class="line"> }</div><div class="line"><span class="keywordflow">else</span></div><div class="line"> {</div><div class="line"> sign = <span class="charliteral">'-'</span>;</div><div class="line"> val128 = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero128, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) value);</div><div class="line"> }</div></div><!-- fragment --><p> Here we use the <b>pveclib</b> operation <a class="el" href="vec__int128__ppc_8h.html#a269401b65405524bb2d971bef595cb0d" title="Vector Compare any Greater Than or Equal Signed Quadword. ">vec_cmpsq_all_ge()</a> because the ABI and compilers do not define compare built-in operations for the vector __int128 type. For the negative case we use the <b>pveclib</b> operation <a class="el" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0" title="Vector Subtract Unsigned Quadword Modulo. ">vec_subuqm()</a> instead of vec_abs. Again the ABI and compilers do not define vec_abs built-ins for the vector __int128 type. 
Using <b>pveclib</b> operations have the additional benefit of supporting older compilers and platform specific implementations for POWER7 and POWER8.</p>
<p>Now we have the absolute value in val128 we can factor it into (3) chunks of 16 digits each. Normally scalar codes would use integer divide/modulo by 10000000000000000. And we are reminded that the PowerISA vector unit does not support integer divide operations and definitely not for quadword integers.</p>
<p>Instead we can use the multiplicative inverse which is a scaled fixed point fraction calculated from the original divisor. This works nicely if the fixed radix point is just before the 128-bit fraction and we have a multiply high (<a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a>) operation. Multiplying a 128-bit unsigned integer by a 128-bit unsigned fraction generates a 256-bit product with 128-bits above (integer) and below (fraction) the radix point. The high 128-bits of the product is the integer quotient and we can discard the low order 128-bits.</p>
<p>It turns out that generating the multiplicative inverse can be tricky. To produce correct results over the full range requires possible pre-scaling and post-shifting, and sometimes a corrective addition is necessary. Fortunately the mathematics are well understood and are commonly used in optimizing compilers. Even better, Henry Warren's book has a whole chapter on this topic. </p><dl class="section see"><dt>See also</dt><dd>"Hacker's Delight, 2nd Edition," Henry S. Warren, Jr, Addison Wesley, 2013. Chapter 10, Integer Division by Constants.</dd></dl>
<p>In the chapter above; </p><blockquote class="doxtable">
<p>Figure 10-2 Computing the magic number for unsigned division.</p>
</blockquote>
<p>provides a sample C function for generating the magic number (actually a struct containing the magic multiplicative inverse, "add" indicator, and the shift amount). For quadword and the divisor 10000000000000000, this is { 76624777043294442917917351357515459181, 0, 51 }:</p><ul>
<li>the multiplier is 76624777043294442917917351357515459181.</li>
<li>no corrective add is required.</li>
<li>the final shift is 51-bits right.</li>
</ul>
<div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_ten16 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0UL, 10000000000000000UL);</div><div class="line"><span class="comment">// Magic numbers for multiplicative inverse to divide by 10**16</span></div><div class="line"><span class="comment">// are 76624777043294442917917351357515459181, no corrective add,</span></div><div class="line"><span class="comment">// and shift right 51 bits.</span></div><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);</div><div class="line"><span class="keyword">const</span> <span class="keywordtype">int</span> shift_ten16 = 51;</div><div class="line">...</div><div class="line"></div><div class="line"><span class="comment">// first divide/modulo the 39 digits __int128 by 10**16.</span></div><div class="line"><span class="comment">// This separates the high/middle 23 digits (tmpq) and low 16 digits.</span></div><div class="line">tmpq = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (val128, mul_invs_ten16);</div><div class="line">tmpq = <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (tmpq, shift_ten16);</div><div class="line"><span class="comment">// Compute remainder of val128 / 10**16</span></div><div 
class="line"><span class="comment">// t_low = val128 - (tmpq * 10**16)</span></div><div class="line"><span class="comment">// Here we know tmpq and mul_ten16 are less then 64-bits</span></div><div class="line"><span class="comment">// so can use vec_vmuloud instead of vec_mulluq</span></div><div class="line">tmp = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) tmpq, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) mul_ten16);</div><div class="line">t_low = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (val128, tmp);</div><div class="line"></div><div class="line"><span class="comment">// Next divide/modulo the high/middle digits by 10**16.</span></div><div class="line"><span class="comment">// This separates the high 7 and middle 16 digits.</span></div><div class="line">val128 = tmpq;</div><div class="line">tmpq = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (tmpq, mul_invs_ten16);</div><div class="line">t_high = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (tmpq, shift_ten16);</div><div class="line">tmp = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (t_high, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) mul_ten16);</div><div class="line">t_mid = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> 
(val128, tmp);</div></div><!-- fragment --><p> All the operations used above are defined and implemented by <b>pveclib</b>. Most of these operations are not defined as single instructions in the PowerISA or as built-ins in the ABI, or require alternative implementations for older processors.</p>
<p>Now we have three vector unsigned __int128 values (t_low, t_mid, t_high) in the range 0-9999999999999999. Fixed point values in that range fit into the low order doubleword of each quadword. We can access these doublewords with array notation ([VEC_DW_L]) and the compiler will transfer them to fixed point (long int) GPRs. Then use normal char and long int printf() formating. For example: </p><div class="fragment"><div class="line">printf (<span class="stringliteral">"%c%07lld%016lld%016lld"</span>, sign,</div><div class="line"> t_high[VEC_DW_L], t_mid[VEC_DW_L], t_low[VEC_DW_L]);</div></div><!-- fragment --><p>Here is the complete vector __int128 printf example: </p><div class="fragment"><div class="line"> <span class="keywordtype">void</span></div><div class="line">example_print_vint128 (<a class="code" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> value)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> max_neg = (<a class="code" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x8000000000000000L, 0UL);</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a> zero128 = (<a class="code" href="vec__common__ppc_8h.html#a3b2bbf9f23490ccca3bdc08bc1dc7831">vi128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x0L, 0UL);</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_ten16 = (<a class="code" 
href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0UL, 10000000000000000UL);</div><div class="line"> <span class="comment">// Magic numbers for multiplicative inverse to divide by 10**16</span></div><div class="line"> <span class="comment">// are 76624777043294442917917351357515459181, no corrective add,</span></div><div class="line"> <span class="comment">// and shift right 51 bits.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x39a5652fb1137856UL, 0xd30baf9a1e626a6dUL);</div><div class="line"> <span class="keyword">const</span> <span class="keywordtype">int</span> shift_ten16 = 51;</div><div class="line"></div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> tmpq, tmp;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a> t_low, t_mid, t_high;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> val128;</div><div class="line"> <span class="keywordtype">char</span> sign;</div><div class="line"></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a269401b65405524bb2d971bef595cb0d">vec_cmpsq_all_ge</a> (value, zero128))</div><div class="line"> {</div><div class="line"> sign = <span class="charliteral">' '</span>;</div><div class="line"> val128 = (<a class="code" 
href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) value;</div><div class="line"> }</div><div class="line"> <span class="keywordflow">else</span></div><div class="line"> {</div><div class="line"> sign = <span class="charliteral">'-'</span>;</div><div class="line"> val128 = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> ((<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) zero128, (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) value);</div><div class="line"> }</div><div class="line"> <span class="comment">// Convert the absolute (unsigned) value to Decimal and</span></div><div class="line"> <span class="comment">// prefix the sign.</span></div><div class="line"></div><div class="line"> <span class="comment">// first divide/modulo the 39 digits __int128 by 10**16.</span></div><div class="line"> <span class="comment">// This separates the high/middle 23 digits (tmpq) and low 16 digits.</span></div><div class="line"> tmpq = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (val128, mul_invs_ten16);</div><div class="line"> tmpq = <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (tmpq, shift_ten16);</div><div class="line"> <span class="comment">// Compute remainder of val128 / 10**16</span></div><div class="line"> <span class="comment">// t_low = val128 - (tmpq * 10**16)</span></div><div class="line"> <span class="comment">// Here we know tmpq and mul_ten16 are less then 64-bits</span></div><div class="line"> <span class="comment">// so can use vec_vmuloud instead of vec_mulluq</span></div><div class="line"> tmp = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> ((<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) 
tmpq, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) mul_ten16);</div><div class="line"> t_low = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (val128, tmp);</div><div class="line"></div><div class="line"> <span class="comment">// Next divide/modulo the high/middle digits by 10**16.</span></div><div class="line"> <span class="comment">// This separates the high 7 and middle 16 digits.</span></div><div class="line"> val128 = tmpq;</div><div class="line"> tmpq = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (tmpq, mul_invs_ten16);</div><div class="line"> t_high = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (tmpq, shift_ten16);</div><div class="line"> tmp = <a class="code" href="vec__int128__ppc_8h.html#a208744996e7482604ad274b44999d6ce">vec_vmuloud</a> (t_high, (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) mul_ten16);</div><div class="line"> t_mid = (<a class="code" href="vec__common__ppc_8h.html#a52a773b6353c69a546bdc2e8686a50ec">vui64_t</a>) <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (val128, tmp);</div><div class="line"></div><div class="line"> printf (<span class="stringliteral">"%c%07lld%016lld%016lld"</span>, sign, t_high[VEC_DW_L],</div><div class="line"> t_mid[VEC_DW_L], t_low[VEC_DW_L]);</div><div class="line">}</div></div><!-- fragment --><h2><a class="anchor" id="int128_examples_0_1_2"></a>
Converting Vector __int128 values to BCD</h2>
<p>POWER8 and POWER9 added a number of Binary Code Decimal (BCD) and Zoned Decimal operations that should be helpful for radix conversion and even faster large integer formatting for print. </p><dl class="section see"><dt>See also</dt><dd><a class="el" href="vec__bcd__ppc_8h.html" title="Header package containing a collection of Binary Coded Decimal (BCD) computation and Zoned Character ...">vec_bcd_ppc.h</a></dd></dl>
<p>The issue remains that __int128 values can represent up to 39 decimal digits while Signed BCD supports only 31 digits. POWER9 provides a <b>Decimal Convert From Signed Quadword</b> instruction with the following restriction:</p>
<dl class="section note"><dt>Note</dt><dd>If the signed value of vrb is less than -(10**31-1) or greater than 10**31-1 the result is too large for the BCD format and the result is undefined.</dd></dl>
<p>It would be useful to check for this and, if required, factor the __int128 value into the high order 8 digits and the low order 31 digits. This allows for the safe and correct use of the <a class="el" href="vec__bcd__ppc_8h.html#a5a1aec05a6dadcf5a1a8e028223745df" title="Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits. ">vec_bcdcfsq()</a> and with some decimal shifts/truncates <a class="el" href="vec__bcd__ppc_8h.html#a832d31ded0b33a2b46f6491bcb71ea51" title="Vector Decimal Convert To Zoned. ">vec_bcdctz()</a>. This also enables conversion to multiple precision Vector BCD to represent 39 digits and more for radix conversions.</p>
<p>We first address the factoring by providing <b>Vector Divide by const 10e31 Unsigned Quadword</b> and <b>Vector Modulo by const 10e31 Unsigned Quadword</b> operations. This requires the multiplicative inverse using the <a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a> operation.</p>
<div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88">vec_divuq_10e31</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra)</div><div class="line"> <span class="comment">// ten32 = +100000000000000000000000000000000UQ</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div><div class="line"> { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };</div><div class="line"> <span class="comment">// Magic numbers for multiplicative inverse to divide by 10**31</span></div><div class="line"> <span class="comment">// are 4804950418589725908363185682083061167, corrective add,</span></div><div class="line"> <span class="comment">// and shift right 107 bits.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);</div><div class="line"> <span class="keyword">const</span> <span class="keywordtype">int</span> shift_ten31 = 103;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> result, t, q;</div><div class="line"></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" 
href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (vra, ten31))</div><div class="line"> {</div><div class="line"> q = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (vra, mul_invs_ten31);</div><div class="line"> <span class="comment">// Need corrective add but want to avoid carry & double quad shift</span></div><div class="line"> <span class="comment">// The following avoids the carry and less instructions</span></div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (vra, q);</div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (t, 1);</div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (t, q);</div><div class="line"> result = <a class="code" href="vec__int128__ppc_8h.html#ac05c640c6a42770cb95466ff4a2d903c">vec_srqi</a> (t, (shift_ten31 - 1));</div><div class="line"> }</div><div class="line"> <span class="keywordflow">else</span></div><div class="line"> result = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) { (__int128) 0 };</div><div class="line"></div><div class="line"> <span class="keywordflow">return</span> result;</div><div class="line">}</div></div><!-- fragment --><p> As the <a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a> operation is relatively expensive and we expect most __int128 values to 31-digits or less, using a compare to bypass the multiplication and return the 0 quotient, seems a prudent optimization.</p>
<p>So far we only have the quotient (the high order 8 digits) and still need to extract the remainder (the low order 31 digits). This is simply the quotient from above multiplied by 10e31 and subtracted from the original input. To avoid the multiple return value issue we define a modulo operation to take the original value and the quotient from <a class="el" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88" title="Vector Divide by const 10e31 Unsigned Quadword. ">vec_divuq_10e31()</a>.</p>
<div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#af4b3b91f7e80522d8a8c0c171e077b99">vec_moduq_10e31</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> q)</div><div class="line">{</div><div class="line"> <span class="comment">// ten32 = +100000000000000000000000000000000UQ</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div><div class="line"> { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> result, t;</div><div class="line"></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (vra, ten31))</div><div class="line"> {</div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de">vec_mulluq</a> (q, ten31);</div><div class="line"> result = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (vra, t);</div><div class="line"> }</div><div class="line"> <span class="keywordflow">else</span></div><div class="line"> result = vra;</div><div class="line"></div><div class="line"> <span class="keywordflow">return</span> result;</div><div class="line">}</div></div><!-- fragment --><p> Again as the <a class="el" 
href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de" title="Vector Multiply Low Unsigned Quadword. ">vec_mulluq()</a> operation is relatively expensive and we expect most __int128 values to be 31 digits or less, using a compare to bypass the multiplication and return the input value as the remainder seems a prudent optimization.</p>
<p>We expect these operations to be used together as in this example. </p><div class="fragment"><div class="line">q = <a class="code" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88">vec_divuq_10e31</a> (a);</div><div class="line">r = <a class="code" href="vec__int128__ppc_8h.html#af4b3b91f7e80522d8a8c0c171e077b99">vec_moduq_10e31</a> (a, q);</div></div><!-- fragment --><p> We also expect the compiler to common the various constant loads across the two operations as the code is in-lined. This header also provides variants for factoring by 10e32 (to use with the Zone conversion) and signed variants of the 10e31 operation for direct conversion to extended precision signed BCD. </p><dl class="section see"><dt>See also</dt><dd><a class="el" href="vec__int128__ppc_8h.html#ae2b45341cc9cc918198bb69da0552098" title="Vector Divide by const 10e32 Unsigned Quadword. ">vec_divuq_10e32()</a>, <a class="el" href="vec__int128__ppc_8h.html#aff4f1d8a707289d2271eafad4aeb1e82" title="Vector Modulo by const 10e32 Unsigned Quadword. ">vec_moduq_10e32()</a>, <a class="el" href="vec__int128__ppc_8h.html#ae320909aca43d55b8be1069f38544ee8" title="Vector Divide by const 10e31 Signed Quadword. ">vec_divsq_10e31</a>, <a class="el" href="vec__int128__ppc_8h.html#aab5db88e4608d4a7408df9042adce86c" title="Vector Modulo by const 10e31 Signed Quadword. ">vec_modsq_10e31</a>.</dd></dl>
<h2><a class="anchor" id="int128_examples_0_1_3"></a>
Extending integer operations beyond Quadword</h2>
<p>Some algorithms require even higher integer precision than __int128 provides. This includes:</p><ul>
<li>POSIX compliant conversion between __float128 and _Decimal128 types</li>
<li>POSIX compliant conversion from double and __float128 to decimal for print.</li>
<li>Cryptographic operations for Public-key cryptography and Elliptic Curves</li>
</ul>
<p>The POWER8 provides instructions for extending add and subtract to 128-bit integer and beyond with carry/extend operations (see <a class="el" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3" title="Vector Add & write Carry Unsigned Quadword. ">vec_addcuq()</a>, <a class="el" href="vec__int128__ppc_8h.html#af18b98d2d73f1afbc439e1407c78f305" title="Vector Add Extended & write Carry Unsigned Quadword. ">vec_addecuq()</a>, <a class="el" href="vec__int128__ppc_8h.html#a44e63f70b182d60fe03b43a80647451a" title="Vector Add Extended Unsigned Quadword Modulo. ">vec_addeuqm()</a>, <a class="el" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8" title="Vector Add Unsigned Quadword Modulo. ">vec_adduqm()</a>, (see <a class="el" href="vec__int128__ppc_8h.html#a95d3546b2fd6840b46b031c15b4f60d3" title="Vector Subtract and Write Carry Unsigned Quadword. ">vec_subcuq()</a>, <a class="el" href="vec__int128__ppc_8h.html#a04f6df21399a4e6228eca254611b23c5" title="Vector Subtract Extended and Write Carry Unsigned Quadword. ">vec_subecuq()</a>, <a class="el" href="vec__int128__ppc_8h.html#a2e40f9bf5df59b725cbfb6738c765202" title="Vector Subtract Extended Unsigned Quadword Modulo. ">vec_subeuqm()</a>, <a class="el" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0" title="Vector Subtract Unsigned Quadword Modulo. ">vec_subuqm()</a>). POWER9 adds instructions to improve decimal / binary conversion to/from 128-bit integer and beyond with carry/extend operations. And while the PowerISA does not yet provide full 128 x 128 bit integer multiply instructions, it has provided wider integer multiply instructions, beginning in POWER8 (see <a class="el" href="vec__int32__ppc_8h.html#add7b91bf6138d029d9d8cc57b0905f1f" title="Vector multiply even signed words. ">vec_mulesw()</a>, <a class="el" href="vec__int32__ppc_8h.html#a415942bd7b8183634e44e56b6a40101b" title="Vector multiply odd signed words. 
">vec_mulosw()</a>, <a class="el" href="vec__int32__ppc_8h.html#ac93f07d5ad73243db2771da83b50d6d8" title="Vector multiply even unsigned words. ">vec_muleuw()</a>, <a class="el" href="vec__int32__ppc_8h.html#a3ca45c65b9627abfc493d4ad500a961d" title="Vector multiply odd unsigned words. ">vec_mulouw()</a>) and again in POWER9 (see <a class="el" href="vec__int128__ppc_8h.html#a1d183ebd232e5826be109cdaa421aeed" title="Vector Multiply-Sum Unsigned Doubleword Modulo. ">vec_msumudm()</a>).</p>
<p>This all allows the <b>pveclib</b> to improve (reduce the latency of) the implementation of multiply quadword operations. This includes operations that generate the full 256-bit multiply product (see <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a>, <a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a>, <a class="el" href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de" title="Vector Multiply Low Unsigned Quadword. ">vec_mulluq()</a>). And this in combination with add/subtract with carry extend quadword allows the coding of even wider (multiple quadword) multiply operations.</p>
<h3><a class="anchor" id="int128_examples_0_1_3_0"></a>
Extended Quadword multiply</h3>
<p>The following example performs a 256x256 bit unsigned integer multiply generating a 512-bit product: </p><div class="fragment"><div class="line"><span class="keywordtype">void</span></div><div class="line">test_mul4uq (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *__restrict__ mulu, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m1h, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m1l,</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2h, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> m2l)</div><div class="line">{</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mc, mp, mq, mqhl;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mphh, mphl, mplh, mpll;</div><div class="line"> mpll = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&mplh, m1l, m2l);</div><div class="line"> mp = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&mphl, m1h, m2l);</div><div class="line"> mplh = <a class="code" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b">vec_addcq</a> (&mc, mplh, mp);</div><div class="line"> mphl = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (mphl, mc);</div><div class="line"> mp = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&mqhl, m2h, m1l);</div><div class="line"> mplh = <a class="code" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b">vec_addcq</a> (&mq, mplh, mp);</div><div class="line"> mphl = <a class="code" 
href="vec__int128__ppc_8h.html#a9e27910c148d525e17d099688aec9ba1">vec_addeq</a> (&mc, mphl, mqhl, mq);</div><div class="line"> mp = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&mphh, m2h, m1h);</div><div class="line"> mphl = <a class="code" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b">vec_addcq</a> (&mq, mphl, mp);</div><div class="line"> mphh = <a class="code" href="vec__int128__ppc_8h.html#a44e63f70b182d60fe03b43a80647451a">vec_addeuqm</a> (mphh, mq, mc);</div><div class="line"></div><div class="line"> mulu[0] = mpll;</div><div class="line"> mulu[1] = mplh;</div><div class="line"> mulu[2] = mphl;</div><div class="line"> mulu[3] = mphh;</div><div class="line">}</div></div><!-- fragment --><p> This example generates some additional questions:</p><ul>
<li>Why use <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a> instead of pairing <a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a> and <a class="el" href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de" title="Vector Multiply Low Unsigned Quadword. ">vec_mulluq()</a>?</li>
<li>Why use <a class="el" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b" title="Vector Add with carry Unsigned Quadword. ">vec_addcq()</a> instead of pairing <a class="el" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3" title="Vector Add & write Carry Unsigned Quadword. ">vec_addcuq()</a> and <a class="el" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8" title="Vector Add Unsigned Quadword Modulo. ">vec_adduqm()</a>?</li>
<li>Why return the 512-bit product via a pointer instead of returning a struct or array of 4 x vui128_t (<em>homogeneous aggregates</em>)?</li>
</ul>
<p>The detailed rationale for this is documented in section <a class="el" href="index.html#mainpage_sub_1_3">Returning extended quadword results.</a> In this specific case (quadword integer operations that generate two vector values) <b>pveclib</b> provides both alternatives:</p><ul>
<li>separate operations each returning a single (high or low order) vector.</li>
<li>combined operations providing:<ul>
<li>the lower order vector as the function return value.</li>
<li>the high order (carry or high product) vector via a pointer reference parameter.</li>
</ul>
</li>
</ul>
<p>Either method should provide the same results. For example: </p><div class="fragment"><div class="line">mplh = <a class="code" href="vec__int128__ppc_8h.html#a363fa7103ccd730c47bb34cb9f05e80b">vec_addcq</a> (&mc, mplh, mp);</div></div><!-- fragment --><p> is equivalent to </p><div class="fragment"><div class="line">mc = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (mplh, mp);</div><div class="line">mplh = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (mplh, mp);</div></div><!-- fragment --><p> and </p><div class="fragment"><div class="line">mpll = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&mplh, m1l, m2l);</div></div><!-- fragment --><p> is equivalent to </p><div class="fragment"><div class="line">mpll = <a class="code" href="vec__int128__ppc_8h.html#a9aaaf0e4c2705be1e0e8e925b09c52de">vec_mulluq</a> (m1l, m2l);</div><div class="line">mplh = <a class="code" href="vec__int128__ppc_8h.html#a10780cd8a88f18ec564ee6254c179a06">vec_mulhud</a> (m1l, m2l);</div></div><!-- fragment --><p> So is there any advantage to separate versus combined operations?</p>
<p>Functionally it is useful to have separate operations for the cases where only one quadword part is needed. For example if you know that an add/subtract operation can not overflow, why generate the carry? Alternatively the quadword greater/less-than compares are based solely on the carry from the subtract quadword, why generate the lower 128-bit (modulo) difference? For multiplication the modulo (multiply low) operation is the expected semantic or is known to be sufficient. Alternatively the multiplicative inverse only uses the high order (multiply high) quadword of the product.</p>
<p>From the performance (instruction latency and throughput) perspective, if the algorithm requires the extended result or full product, the combined operation is usually the better choice. Otherwise use the specific single return operation needed. At best, the separate operations may generate the same instruction sequence as the combined operation, but this depends on the target platform and specific optimizations implemented by the compiler.</p>
<dl class="section note"><dt>Note</dt><dd>For inlined operations the pointer reference in the combined form is usually optimized to a simple register assignment by the compiler. </dd>
<dd>
For platform targets where the separate operations each generate a single instruction, we expect the compiler to generate the same instructions as the combined operation. But this is only likely for add/sub quadword on the POWER8 and multiply by 10 quadword on POWER9.</dd></dl>
<h3><a class="anchor" id="int128_examples_0_1_3_1"></a>
Quadword Long Division</h3>
<p>In the section <a class="el" href="vec__int128__ppc_8h.html#int128_examples_0_1_2">Converting Vector __int128 values to BCD</a> above we used multiplicative inverse to factor a binary quadword value in two (high quotient and low remainder) parts. Here we divide by a large power of 10 (10<sup>31</sup> or 10<sup>32</sup>) of a size where the quotient and remainder allow direct conversion to BCD (see <a class="el" href="vec__bcd__ppc_8h.html#a5a1aec05a6dadcf5a1a8e028223745df" title="Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits. ">vec_bcdcfsq()</a>, <a class="el" href="vec__bcd__ppc_8h.html#a7b8b5371d537cd878ffb37337e93ba14" title="Vector Decimal Convert From Unsigned Quadword returning up to 32 BCD digits. ">vec_bcdcfuq()</a>). After conversion, the BCD parts can be concatenated to form the larger (39 digit) decimal radix value equivalent of the 128-bit binary value.</p>
<p>We can extend this technique to larger (multiple quadword) binary values but this requires long division. This is the version of the long division you learned in grade school, where a multi-digit value is divided in stages by a single digit. But the digits we are using are really big (10<sup>31</sup>-1 or 10<sup>32</sup>-1).</p>
<p>The first step is relatively easy. Start by dividing the left-most <em>digit</em> of the dividend by the divisor, generating the integer quotient and remainder. We already have operations to implement that. </p><div class="fragment"><div class="line"><span class="comment">// initial step for the top digits</span></div><div class="line">dn = d[0];</div><div class="line">qh = <a class="code" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88">vec_divuq_10e31</a> (dn);</div><div class="line">rh = <a class="code" href="vec__int128__ppc_8h.html#af4b3b91f7e80522d8a8c0c171e077b99">vec_moduq_10e31</a> (dn, qh);</div><div class="line">q[0] = qh;</div></div><!-- fragment --><p> The array <em>d</em> contains the quadwords of the extended precision integer dividend. The array <em>q</em> will contain the quadwords of the extended precision integer quotient. Here we have generated the first <em>quadword q[0]</em> digit of the quotient. The remainder <em>rh</em> will be used in the next step of the long division.</p>
<p>The process repeats except after the first step we have an intermediate dividend formed from:</p><ul>
<li>The remainder from the previous step</li>
<li>Concatenated with the next <em>digit</em> of the extended precision quadword dividend.</li>
</ul>
<p>So for each additional step we need to divide two quadwords (256-bits) by the quadword divisor. Actually this dividend should be less than a full 256-bits because we know the remainder is less than the divisor. So the intermediate dividend is less than (divisor * 2<sup>128</sup>). So we know the quotient can not exceed (2<sup>128</sup>-1) or one quadword.</p>
<p>Now we need an operation that will divide this double quadword value and provide quotient and remainder that are correct (or close enough). Remember your grade school long division where you would:</p><ul>
<li>estimate the quotient</li>
<li>multiply the quotient by the divisor</li>
<li>subtract this product from the current 2 digit dividend</li>
<li>check that the remainder is less than the divisor.<ul>
<li>if the remainder is greater than or equal to the divisor; the estimated quotient is too small</li>
<li>if the remainder is negative (the product was greater than the dividend); the estimated quotient is too large.</li>
</ul>
</li>
<li>correct the quotient and remainder if needed before doing the next step.</li>
</ul>
<p>So we don't need to be perfect, but close enough. As long as we can detect any problems and (if needed) correct the results, we can implement long division to any size.</p>
<p>We already have an operation for dividing a quadword by 10<sup>31</sup> using the magic numbers for multiplicative inverse. This can easily be extended to multiply double quadword high. For example: </p><div class="fragment"><div class="line"><span class="comment">// Multiply high [vra||vrb] * mul_invs_ten31</span></div><div class="line">q = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (vrb, mul_invs_ten31);</div><div class="line">q1 = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&t, vra, mul_invs_ten31);</div><div class="line">c = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (q1, q);</div><div class="line">q = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (q1, q);</div><div class="line">q1 = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (t, c);</div><div class="line"><span class="comment">// corrective add [q2||q1||q] = [q1||q] + [vra||vrb]</span></div><div class="line">c = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (vrb, q);</div><div class="line">q = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (vrb, q);</div><div class="line"><span class="comment">// q2 is the carry-out from the corrective add</span></div><div class="line">q2 = <a class="code" href="vec__int128__ppc_8h.html#af18b98d2d73f1afbc439e1407c78f305">vec_addecuq</a> (q1, vra, c);</div><div class="line">q1 = <a class="code" href="vec__int128__ppc_8h.html#a44e63f70b182d60fe03b43a80647451a">vec_addeuqm</a> (q1, vra, c);</div><div class="line"><span class="comment">// shift 384-bits (including the carry) right 103 bits</span></div><div class="line"><span class="comment">// Using shift left double quadword shift by (128-103)-bits</span></div><div 
class="line">r2 = <a class="code" href="vec__int128__ppc_8h.html#aaa33904ec4de42f54cceab34adb303c5">vec_sldqi</a> (q2, q1, (128 - shift_ten31));</div><div class="line">result = <a class="code" href="vec__int128__ppc_8h.html#aaa33904ec4de42f54cceab34adb303c5">vec_sldqi</a> (q1, q, (128 - shift_ten31));</div></div><!-- fragment --><p> Here we generate a 256-bit multiply high using the <a class="el" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2" title="Vector Multiply High Unsigned Quadword. ">vec_mulhuq()</a> for the low dividend (vrb) and <a class="el" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8" title="Vector Multiply Unsigned Double Quadword. ">vec_muludq()</a> for high dividend (vra). Then sum the partial products ([t||q1] + [0||q]) to get initial 256-bit product [q1||q]. Then apply the corrective add ([q1||q] + [vra||vrb]). This may generate a carry which needs to be included in the final shift.</p>
<p>Technically we only expect a 128-bit quotient after the shift, but we have 3 quadwords (2 quadwords and a carry) going into the shift right. Also our (estimated) quotient may be <em>off by 1</em> and generate a 129-bit result. This is due to using the magic numbers for 128-bit multiplicative inverse and not regenerating magic numbers for 256-bits. We can't do anything about that now and so return a 256-bit double quadword quotient.</p>
<dl class="section note"><dt>Note</dt><dd>This is where only needing to be "close enough", works in our favor. We will check and correct the quotient in the modulo operation.</dd></dl>
<p>The 256-bits we want are spanning multiple quadwords so we replace a simple quadword shift right with two <b>Shift Left Double Quadword Immediate</b> operations and complement the shift count (128 - shift_ten31). This gives a 256-bit quotient which we expect to have zero in the high quadword.</p>
<p>As this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the quotient is zero. This also helps when the long division dividend may have leading quadword zeros that need to be skipped over. The full implementation looks like: </p><div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#afa2db6d665f837f96c746d88027e9e19">vec_divudq_10e31</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *qh, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb)</div><div class="line">{</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div><div class="line"> { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> zero = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) { (__int128) 0UL };</div><div class="line"> <span class="comment">// Magic numbers for multiplicative inverse to divide by 10**31</span></div><div class="line"> <span class="comment">// are 4804950418589725908363185682083061167, corrective add,</span></div><div class="line"> <span 
class="comment">// and shift right 103 bits.</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) <a class="code" href="vec__common__ppc_8h.html#a9ed8c282b57705c960542ed869de3325">CONST_VINT128_DW</a>(</div><div class="line"> 0x039d66589687f9e9UL, 0x01d59f290ee19dafUL);</div><div class="line"> <span class="keyword">const</span> <span class="keywordtype">int</span> shift_ten31 = 103;</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> result, r2, t, q, q1, q2, c;</div><div class="line"></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a1799f860ba79e698c66b171392afde01">vec_cmpuq_all_ne</a> (vra, zero) || <a class="code" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (vrb, ten31))</div><div class="line"> {</div><div class="line"> <span class="comment">// Multiply high [vra||vrb] * mul_invs_ten31</span></div><div class="line"> q = <a class="code" href="vec__int128__ppc_8h.html#ad6be9c8f02e43c39a659d6bbc9c3a2d2">vec_mulhuq</a> (vrb, mul_invs_ten31);</div><div class="line"> q1 = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&t, vra, mul_invs_ten31);</div><div class="line"> c = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (q1, q);</div><div class="line"> q = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (q1, q);</div><div class="line"> q1 = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (t, c);</div><div class="line"> <span class="comment">// corrective add [q2||q1||q] = [q1||q] + 
[vra||vrb]</span></div><div class="line"> c = <a class="code" href="vec__int128__ppc_8h.html#ad7aaadba249ce46c4c94f78df1020da3">vec_addcuq</a> (vrb, q);</div><div class="line"> q = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (vrb, q);</div><div class="line"> <span class="comment">// q2 is the carry-out from the corrective add</span></div><div class="line"> q2 = <a class="code" href="vec__int128__ppc_8h.html#af18b98d2d73f1afbc439e1407c78f305">vec_addecuq</a> (q1, vra, c);</div><div class="line"> q1 = <a class="code" href="vec__int128__ppc_8h.html#a44e63f70b182d60fe03b43a80647451a">vec_addeuqm</a> (q1, vra, c);</div><div class="line"> <span class="comment">// shift 384-bits (including the carry) right 103 bits</span></div><div class="line"> <span class="comment">// Using shift left double quadword shift by (128-103)-bits</span></div><div class="line"> r2 = <a class="code" href="vec__int128__ppc_8h.html#aaa33904ec4de42f54cceab34adb303c5">vec_sldqi</a> (q2, q1, (128 - shift_ten31));</div><div class="line"> result = <a class="code" href="vec__int128__ppc_8h.html#aaa33904ec4de42f54cceab34adb303c5">vec_sldqi</a> (q1, q, (128 - shift_ten31));</div><div class="line"> }</div><div class="line"> <span class="keywordflow">else</span></div><div class="line"> {</div><div class="line"> <span class="comment">// Dividend is less than divisor then return zero quotient</span></div><div class="line"> r2 = zero;</div><div class="line"> result = zero;</div><div class="line"> }</div><div class="line"></div><div class="line"> <span class="comment">// return 256-bit quotient</span></div><div class="line"> *qh = r2;</div><div class="line"> <span class="keywordflow">return</span> result;</div><div class="line">}</div></div><!-- fragment --><p>To complete the long division operation we need to perform double quadword modulo operations. 
Here the inputs are the two quadword dividend and the low quadword of the quotient from the divide double quadword operation above. We use multiply double quadword to compute the remainder ([vra||vrb] - (q * 10<sup>31</sup>)). Generating the 256-bit product and difference ensures we can detect the case where the quotient is off-by-1 on the high side.</p>
<div class="fragment"><div class="line">t = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&th, *ql, ten31);</div><div class="line">c = <a class="code" href="vec__int128__ppc_8h.html#a95d3546b2fd6840b46b031c15b4f60d3">vec_subcuq</a> (vrb, t);</div><div class="line">t = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (vrb, t);</div><div class="line">th = <a class="code" href="vec__int128__ppc_8h.html#a2e40f9bf5df59b725cbfb6738c765202">vec_subeuqm</a> (vra, th, c);</div><div class="line"><span class="comment">// The remainder should be less than the divisor</span></div><div class="line"><span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a1799f860ba79e698c66b171392afde01">vec_cmpuq_all_ne</a> (th, zero) && <a class="code" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (t, ten31))</div><div class="line"> {</div><div class="line"> <span class="comment">// Otherwise the estimated quotient is off by 1</span></div><div class="line"> *ql = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (*ql, minus_one);</div><div class="line"> <span class="comment">// And the remainder is negative, so add the divisor</span></div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (t, ten31);</div><div class="line"> }</div><div class="line">result = t;</div></div><!-- fragment --><p> In this case we need to correct both remainder and the (estimated) quotient. This is a bit tricky as the quotient is normally passed by value, but for this operation we need to pass by reference, which allows the corrected quotient to be passed on to the next step.</p>
<p>Again as this operation will be used in a loop for long division operations and the extended multiplies are fairly expensive, we should check for and short-circuit special conditions. The most important special condition is when the dividend is less than the divisor and the remainder is simply the dividend.</p>
<div class="fragment"><div class="line"><span class="keyword">static</span> <span class="keyword">inline</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line"><a class="code" href="vec__int128__ppc_8h.html#a31a893a75e42f5f6c4dfe793678fea59">vec_modudq_10e31</a> (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vra, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vrb, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *ql)</div><div class="line">{</div><div class="line"> <span class="comment">// ten31 = +100000000000000000000000000000000UQ</span></div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten31 = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>)</div><div class="line"> { (__int128) 1000000000000000UL * (__int128) 10000000000000000UL };</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> zero = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) { (__int128) 0UL };</div><div class="line"> <span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> minus_one = (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) { (__int128) -1L };</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> result, t, th, c;</div><div class="line"></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a1799f860ba79e698c66b171392afde01">vec_cmpuq_all_ne</a> 
(vra, zero) || <a class="code" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (vrb, ten31))</div><div class="line"> {</div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#aee5c5b2998ef105b4c6f39739748ffa8">vec_muludq</a> (&th, *ql, ten31);</div><div class="line"> c = <a class="code" href="vec__int128__ppc_8h.html#a95d3546b2fd6840b46b031c15b4f60d3">vec_subcuq</a> (vrb, t);</div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a6bafb410404d4f1e10a99263b57d1df0">vec_subuqm</a> (vrb, t);</div><div class="line"> th = <a class="code" href="vec__int128__ppc_8h.html#a2e40f9bf5df59b725cbfb6738c765202">vec_subeuqm</a> (vra, th, c);</div><div class="line"> <span class="comment">// The remainder should be less than the divisor</span></div><div class="line"> <span class="keywordflow">if</span> (<a class="code" href="vec__int128__ppc_8h.html#a1799f860ba79e698c66b171392afde01">vec_cmpuq_all_ne</a> (th, zero) && <a class="code" href="vec__int128__ppc_8h.html#af8f06b2c3d612a7cfdeb3bb883c59e19">vec_cmpuq_all_ge</a> (t, ten31))</div><div class="line"> {</div><div class="line"> <span class="comment">// If not the estimated quotient is off by 1</span></div><div class="line"> *ql = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (*ql, minus_one);</div><div class="line"> <span class="comment">// And the remainder is negative, so add the divisor</span></div><div class="line"> t = <a class="code" href="vec__int128__ppc_8h.html#a539de2a4426a84102471306acc571ce8">vec_adduqm</a> (t, ten31);</div><div class="line"> }</div><div class="line"> result = t;</div><div class="line"> }</div><div class="line"> <span class="keywordflow">else</span></div><div class="line"> result = vrb;</div><div class="line"></div><div class="line"> <span class="keywordflow">return</span> result;</div><div class="line">}</div></div><!-- fragment --><p>Now we have all the operations needed 
to complete the implementation of long division by the decimal constant (10<sup>31</sup>).</p>
<div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="line">example_longdiv_10e31 (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *q, <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> *d, <span class="keywordtype">long</span> <span class="keywordtype">int</span> _N)</div><div class="line">{</div><div class="line"> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> dn, qh, ql, rh;</div><div class="line"> <span class="keywordtype">long</span> <span class="keywordtype">int</span> i;</div><div class="line"></div><div class="line"> <span class="comment">// initial step for the top digits</span></div><div class="line"> dn = d[0];</div><div class="line"> qh = <a class="code" href="vec__int128__ppc_8h.html#a9a6a39212f8a8b9ebf20e0117e1e1e88">vec_divuq_10e31</a> (dn);</div><div class="line"> rh = <a class="code" href="vec__int128__ppc_8h.html#af4b3b91f7e80522d8a8c0c171e077b99">vec_moduq_10e31</a> (dn, qh);</div><div class="line"> q[0] = qh;</div><div class="line"></div><div class="line"> <span class="comment">// now we know the remainder is less than the divisor.</span></div><div class="line"> <span class="keywordflow">for</span> (i=1; i<_N; i++)</div><div class="line"> {</div><div class="line"> dn = d[i];</div><div class="line"> ql = <a class="code" href="vec__int128__ppc_8h.html#afa2db6d665f837f96c746d88027e9e19">vec_divudq_10e31</a> (&qh, rh, dn);</div><div class="line"> rh = <a class="code" href="vec__int128__ppc_8h.html#a31a893a75e42f5f6c4dfe793678fea59">vec_modudq_10e31</a> (rh, dn, &ql);</div><div class="line"> q[i] = ql;</div><div class="line"> }</div><div class="line"> <span class="comment">// return the final remainder</span></div><div class="line"> <span class="keywordflow">return</span> rh;</div><div class="line">}</div></div><!-- 
fragment --><p> The result of each call to example_longdiv_10e31() is the output array <em>q</em> of quadwords containing the extended quotient, and the remainder as the return value. The input array <em>d</em> and output array <em>q</em> should not overlap in storage. The remainder is in the range 0-9999999999999999999999999999999 and is suitable for conversion to BCD or decimal characters. (see <a class="el" href="vec__bcd__ppc_8h.html#a5a1aec05a6dadcf5a1a8e028223745df" title="Vector Decimal Convert From Signed Quadword returning up to 31 BCD digits. ">vec_bcdcfsq()</a>). Repeated calls, passing the quotient from the previous call as the dividend, reduce the quotient by 31 digits and return another 31 digits in the remainder for conversion. This continues until the quotient is less than 10<sup>31</sup> which provides the highest order digits of the decimal result.</p>
<dl class="section note"><dt>Note</dt><dd>Similarly for long division in support of unsigned 32-digit BCD conversion using operations; <a class="el" href="vec__int128__ppc_8h.html#ae2b45341cc9cc918198bb69da0552098" title="Vector Divide by const 10e32 Unsigned Quadword. ">vec_divuq_10e32()</a>, <a class="el" href="vec__int128__ppc_8h.html#aff4f1d8a707289d2271eafad4aeb1e82" title="Vector Modulo by const 10e32 Unsigned Quadword. ">vec_moduq_10e32()</a>, <a class="el" href="vec__int128__ppc_8h.html#a917acd42e775f4bb323ba2104c52d7cb" title="Vector Divide Unsigned Double Quadword by const 10e32. ">vec_divudq_10e32()</a>, and <a class="el" href="vec__int128__ppc_8h.html#a2ccbd77900956c01a51b88e672e593c6" title="Vector Modulo Unsigned Double Quadword by const 10e32. ">vec_modudq_10e32()</a>. Long division for other constant divisors or multiple quadword divisors is an exercise for the student.</dd></dl>
<dl class="todo"><dt><b><a class="el" href="todo.html#_todo000003">Todo:</a></b></dt><dd>The implementation above gives correct results for all the cases tested for divide by constants 10<sup>31</sup> and 10<sup>32</sup>. This is not a mathematical proof of correctness, just an observation. Anyone who finds a counter example or offers a mathematical proof should submit a bug report.</dd></dl>
<h1><a class="anchor" id="int128_perf_0_0"></a>
Performance data.</h1>
<p>High level performance estimates are provided as an aid to function selection when evaluating algorithms. For background on how <em>Latency</em> and <em>Throughput</em> are derived see: <a class="el" href="index.html#perf_data">Performance data.</a> </p>
</div><h2 class="groupheader">Macro Definition Documentation</h2>
<a id="acd5c20e29b155f8f575d60f6af8f7955"></a>
<h2 class="memtitle"><span class="permalink"><a href="#acd5c20e29b155f8f575d60f6af8f7955">◆ </a></span>CONST_VUINT128_Qx16d</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">#define CONST_VUINT128_Qx16d</td>
<td>(</td>
<td class="paramtype"> </td>
<td class="paramname">__q0, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q1 </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<b>Value:</b><div class="fragment"><div class="line">( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) \</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) __q0) * 10000000000000000UL) \</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) __q1) )</div><div class="ttc" id="vec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element. </div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
</div><!-- fragment -->
<p>Generate a vector unsigned __int128 constant from doublewords. </p>
<p>Combine 2 x 16 decimal digit long long constants into a single 32 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.</p>
<p>For example</p>
<div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten32 = <a class="code" href="vec__int128__ppc_8h.html#acd5c20e29b155f8f575d60f6af8f7955">CONST_VUINT128_Qx16d</a> (10000000000000000UL, 0UL);</div></div><!-- fragment -->
</div>
</div>
<a id="aa9c94b59ae2504f498923ed506a22083"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aa9c94b59ae2504f498923ed506a22083">◆ </a></span>CONST_VUINT128_Qx18d</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">#define CONST_VUINT128_Qx18d</td>
<td>(</td>
<td class="paramtype"> </td>
<td class="paramname">__q0, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q1 </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<b>Value:</b><div class="fragment"><div class="line">( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) \</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) __q0) * 1000000000000000000UL) \</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) __q1) )</div><div class="ttc" id="vec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element. </div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
</div><!-- fragment -->
<p>Generate a vector unsigned __int128 constant from doublewords. </p>
<p>Combine 2 x 18 decimal digit long long constants into a single 36 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.</p>
<p>For example</p>
<div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten36-1 = <a class="code" href="vec__int128__ppc_8h.html#aa9c94b59ae2504f498923ed506a22083">CONST_VUINT128_Qx18d</a> (999999999999999999UL, 999999999999999999UL);</div></div><!-- fragment -->
</div>
</div>
<a id="a25faf0c51245eefdaeda1dc5dd71c516"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a25faf0c51245eefdaeda1dc5dd71c516">◆ </a></span>CONST_VUINT128_Qx19d</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">#define CONST_VUINT128_Qx19d</td>
<td>(</td>
<td class="paramtype"> </td>
<td class="paramname">__q0, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q1 </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<b>Value:</b><div class="fragment"><div class="line">( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) \</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) __q0) * 10000000000000000000UL) \</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) __q1) )</div><div class="ttc" id="vec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element. </div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
</div><!-- fragment -->
<p>Generate a vector unsigned __int128 constant from doublewords. </p>
<p>Combine 2 x 19 decimal digit long long constants into a single 38 decimal digit __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.</p>
<p>For example</p>
<div class="fragment"><div class="line"><span class="keyword">const</span> <a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> mul_invs_ten16 = <a class="code" href="vec__int128__ppc_8h.html#a25faf0c51245eefdaeda1dc5dd71c516">CONST_VUINT128_Qx19d</a>(</div><div class="line"> 7662477704329444291UL, 7917351357515459181UL);</div></div><!-- fragment -->
</div>
</div>
<a id="a12118674c4e47eb7c939bb29a379d381"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a12118674c4e47eb7c939bb29a379d381">◆ </a></span>CONST_VUINT128_QxD</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">#define CONST_VUINT128_QxD</td>
<td>(</td>
<td class="paramtype"> </td>
<td class="paramname">__q0, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q1 </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<b>Value:</b><div class="fragment"><div class="line">( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) \</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) __q0) &lt;&lt; 64) \</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) __q1) )</div><div class="ttc" id="vec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element. </div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
</div><!-- fragment -->
<p>Generate a vector unsigned __int128 constant from doublewords. </p>
<p>Combine 2 x 64-bit long long constants into a single __int128 constant. The 2 parameters are long integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.</p>
<p>For example</p>
<div class="fragment"><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten32 = <a class="code" href="vec__int128__ppc_8h.html#a12118674c4e47eb7c939bb29a379d381">CONST_VUINT128_QxD</a> (0x000004ee2d6d415bUL, 0x85acef8100000000UL);</div></div><!-- fragment -->
</div>
</div>
<a id="a0f75e65180e68c4753f3d9c2f42d1a31"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a0f75e65180e68c4753f3d9c2f42d1a31">◆ </a></span>CONST_VUINT128_QxW</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">#define CONST_VUINT128_QxW</td>
<td>(</td>
<td class="paramtype"> </td>
<td class="paramname">__q0, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q1, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q2, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"> </td>
<td class="paramname">__q3 </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<b>Value:</b><div class="fragment"><div class="line">( (<a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a>) \</div><div class="line"> (((<span class="keywordtype">unsigned</span> __int128) __q0) &lt;&lt; 96) \</div><div class="line"> + (((<span class="keywordtype">unsigned</span> __int128) __q1) &lt;&lt; 64) \</div><div class="line"> + (((<span class="keywordtype">unsigned</span> __int128) __q2) &lt;&lt; 32) \</div><div class="line"> + ((<span class="keywordtype">unsigned</span> __int128) __q3) )</div><div class="ttc" id="vec__common__ppc_8h_html_aaf7a8e92d8ba681dac3d2ec3259c0820"><div class="ttname"><a href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a></div><div class="ttdeci">__vector unsigned __int128 vui128_t</div><div class="ttdoc">vector of one 128-bit unsigned __int128 element. </div><div class="ttdef"><b>Definition:</b> vec_common_ppc.h:237</div></div>
</div><!-- fragment -->
<p>Generate a vector unsigned __int128 constant from words. </p>
<p>Combine 4 x 32-bit int constants into a single __int128 constant. The 4 parameters are integer constant values in high to low order. This order is consistent for big and little endian and the result loaded into vector registers is correct for quadword integer operations.</p>
<p>The effect is to compute an unsigned __int128 constant from 4 x 32-bit unsigned int constants. </p><div class="fragment"><div class="line">int128 = (__q0 &lt;&lt; 96) + (__q1 &lt;&lt; 64) + (__q2 &lt;&lt; 32) + __q3</div></div><!-- fragment --><p>For example </p><div class="fragment"><div class="line"><span class="comment">// const for 100000000000000000000000000000000 (AKA 10**32)</span></div><div class="line"><a class="code" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> ten32 = <a class="code" href="vec__int128__ppc_8h.html#a0f75e65180e68c4753f3d9c2f42d1a31">CONST_VUINT128_QxW</a> (0x000004ee, 0x2d6d415b,</div><div class="line"> 0x85acef81, 0x00000000);</div></div><!-- fragment -->
</div>
</div>
<h2 class="groupheader">Function Documentation</h2>
<a id="abf1707d712cc191915a8f558eaaa1fe7"></a>
<h2 class="memtitle"><span class="permalink"><a href="#abf1707d712cc191915a8f558eaaa1fe7">◆ </a></span>vec_absduq()</h2>
<div class="memitem">
<div class="memproto">
<table class="mlabels">
<tr>
<td class="mlabels-left">
<table class="memname">
<tr>
<td class="memname">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec_absduq </td>
<td>(</td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>vra</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>vrb</em> </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</td>
<td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span> </td>
</tr>
</table>
</div><div class="memdoc">
<p>Vector Absolute Difference Unsigned Quadword. </p>
<p>Compute the absolute difference of the quadwords. For each unsigned quadword, subtract VRB from VRA and return the absolute value of the difference.</p>
<table class="doxtable">
<tr>
<th align="right">processor</th><th align="center">Latency</th><th align="left">Throughput </th></tr>
<tr>
<td align="right">power8 </td><td align="center">14 </td><td align="left">1/cycle </td></tr>
<tr>
<td align="right">power9 </td><td align="center">11 </td><td align="left">1/cycle </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
<table class="params">
<tr><td class="paramname">vra</td><td>vector of unsigned __int128 </td></tr>
<tr><td class="paramname">vrb</td><td>vector of unsigned __int128 </td></tr>
</table>
</dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>vector of the absolute difference. </dd></dl>
</div>
</div>
<a id="a363fa7103ccd730c47bb34cb9f05e80b"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a363fa7103ccd730c47bb34cb9f05e80b">◆ </a></span>vec_addcq()</h2>
<div class="memitem">
<div class="memproto">
<table class="mlabels">
<tr>
<td class="mlabels-left">
<table class="memname">
<tr>
<td class="memname">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec_addcq </td>
<td>(</td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> * </td>
<td class="paramname"><em>cout</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>a</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>b</em> </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</td>
<td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span> </td>
</tr>
</table>
</div><div class="memdoc">
<p>Vector Add with carry Unsigned Quadword. </p>
<p>Add two vector __int128 values and return sum and the carry out.</p>
<table class="doxtable">
<tr>
<th align="right">processor</th><th align="center">Latency</th><th align="left">Throughput </th></tr>
<tr>
<td align="right">power8 </td><td align="center">8 </td><td align="left">1/2 cycles </td></tr>
<tr>
<td align="right">power9 </td><td align="center">6 </td><td align="left">2/cycle </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
<table class="params">
<tr><td class="paramname">*cout</td><td>carry out from the sum of a and b. </td></tr>
<tr><td class="paramname">a</td><td>128-bit vector treated as a __int128. </td></tr>
<tr><td class="paramname">b</td><td>128-bit vector treated as a __int128. </td></tr>
</table>
</dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>__int128 (lower 128-bits) sum of a and b. </dd></dl>
</div>
</div>
<a id="ad7aaadba249ce46c4c94f78df1020da3"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ad7aaadba249ce46c4c94f78df1020da3">◆ </a></span>vec_addcuq()</h2>
<div class="memitem">
<div class="memproto">
<table class="mlabels">
<tr>
<td class="mlabels-left">
<table class="memname">
<tr>
<td class="memname">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec_addcuq </td>
<td>(</td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>a</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype"><a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> </td>
<td class="paramname"><em>b</em> </td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</td>
<td class="mlabels-right">
<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span> </td>
</tr>
</table>
</div><div class="memdoc">
<p>Vector Add &amp; write Carry Unsigned Quadword. </p>
<p>Add two vector __int128 values and return the carry out.</p>
<table class="doxtable">
<tr>
<th align="right">processor</th><th align="center">Latency</th><th align="left">Throughput </th></tr>
<tr>
<td align="right">power8 </td><td align="center">4 </td><td align="left">2/2 cycles </td></tr>
<tr>
<td align="right">power9 </td><td align="center">3 </td><td align="left">2/cycle </td></tr>
</table>
<dl class="params"><dt>Parameters</dt><dd>
<table class="params">
<tr><td class="paramname">a</td><td>128-bit vector treated as a __int128. </td></tr>
<tr><td class="paramname">b</td><td>128-bit vector treated as a __int128. </td></tr>
</table>
</dd>
</dl>
<dl class="section return"><dt>Returns</dt><dd>__int128 carry of the sum of a and b. </dd></dl>
</div>
</div>
<a id="af18b98d2d73f1afbc439e1407c78f305"></a>
<h2 class="memtitle"><span class="permalink"><a href="#af18b98d2d73f1afbc439e1407c78f305">◆ </a></span>vec_addecuq()</h2>
<div class="memitem">
<div class="memproto">
<table class="mlabels">
<tr>
<td class="mlabels-left">
<table class="memname">
<tr>
<td class="memname">static <a class="el" href="vec__common__ppc_8h.html#aaf7a8e92d8ba681dac3d2ec3259c0820">vui128_t</a> vec_addecuq </td>
<td>(</td>