Commit 110523d6f311d7ee81835566b5594e6feb2ce9dd

Authored by Jim Plank
1 parent 79a46d18
Exists in master and in 3 other branches v1, v2, v3

GF-Complete Release 1.0.

Please see the user's manual for details.
GNUmakefile
1 1 #
2 2 # GNUmakefile for Galois field library
3 3 #
4   -#
  4 +# The default flags do *not* have the SSE instructions enabled.
  5 +# Please cd to flag_tester and run which_compile_flags.sh to see which SSE instructions
  6 +# your machine and compiler support, and which flags you should include below.
  7 +
  8 +CFLAGS = -O3
  9 +LDFLAGS = -O3
5 10  
6 11 SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
7 12 gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
8 13 gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
9   - gf_inline_time.c
  14 + gf_inline_time.c gf_example_5.c gf_example_6.c gf_example_7.c
10 15  
11 16 HDRS = gf_complete.h gf_int.h
12 17  
13 18 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
14   - gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
15   -
16   -CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
17   -LDFLAGS = -O3 -msse4 -maes -mpclmul
18   -
19   -# Use these if you don't have INTEL_PCLMUL
20   -# CFLAGS = -O3 -msse4 -DINTEL_SSE4
21   -# LDFLAGS = -O3 -msse4
  19 + gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time \
  20 + gf_example_5 gf_example_6 gf_example_7
22 21  
23 22 RM = /bin/rm -f
24 23  
... ... @@ -45,6 +44,9 @@ gf_example_1: gf_example_1.o gf_complete.a
45 44 gf_example_2: gf_example_2.o gf_complete.a
46 45 gf_example_3: gf_example_3.o gf_complete.a
47 46 gf_example_4: gf_example_4.o gf_complete.a
  47 +gf_example_5: gf_example_5.o gf_complete.a
  48 +gf_example_6: gf_example_6.o gf_complete.a
  49 +gf_example_7: gf_example_7.o gf_complete.a
48 50 gf_mult: gf_mult.o gf_complete.a
49 51 gf_div: gf_div.o gf_complete.a
50 52 gf_poly: gf_poly.o gf_complete.a
... ... @@ -54,7 +56,8 @@ clean:
54 56 $(RM) $(OBJS) gf_div.c
55 57  
56 58 spotless: clean
57   - $(RM) *~ $(EXECUTABLES)
  59 + $(RM) *~ $(EXECUTABLES) which_compile_flags
  60 + $(RM) gf_complete.a
58 61  
59 62 gf_div.o: gf_complete.h gf_method.h
60 63 gf_methods.o: gf_complete.h gf_method.h
... ... @@ -71,8 +74,12 @@ gf_example_1.o: gf_complete.h gf_rand.h
71 74 gf_example_2.o: gf_complete.h gf_rand.h
72 75 gf_example_3.o: gf_complete.h gf_rand.h
73 76 gf_example_4.o: gf_complete.h gf_rand.h
  77 +gf_example_5.o: gf_complete.h gf_rand.h
  78 +gf_example_6.o: gf_complete.h gf_rand.h
  79 +gf_example_7.o: gf_complete.h gf_rand.h
74 80 gf_general.o: gf_complete.h gf_int.h gf_general.h gf_rand.h
75 81 gf_mult.o: gf_complete.h gf_method.h
  82 +gf.o: gf_complete.h gf_int.h
76 83 gf_method.o: gf_complete.h
77 84  
78 85 gf_div.c: gf_mult.c
... ...
Log-Zero-for-w=8.odg
No preview for this file type
Manual.pdf 0 → 100644
No preview for this file type
... ... @@ -1 +0,0 @@
1   -This is a README file.
README.txt
1   -This is GF-Complete, Revision 0.1.
  1 +This is GF-Complete, Revision 1.0.
  2 +
  3 +The user's manual is in the file Manual.pdf.
  4 +
  5 +There are two online homes for GF-Complete:
  6 +
  7 + - https://bitbucket.org/jimplank/gf-complete
  8 + - http://www.cs.utk.edu/~plank/plank/papers/CS-13-716.html
  9 +
  10 +When compiling this for the first time, cd to flag_tester, and
  11 +do "sh which_compile_flags.sh xxx", where xxx is the compiler
  12 +that you will use in the GNUmakefile.
2 13  
3   -Please see http://www.cs.utk.edu/~plank/plank/papers/CS-13-703.html for the user's
4   -manual and other important documentation about this library, including more
5   -recent revisions.
... ...
explanation.html
... ... @@ -1,777 +0,0 @@
1   -<h3>Code structure as of 7/20/2012</h3>
2   -
3   -written by Jim.
4   -<p>
5   -Ok -- once again, I have messed with the structure. My goal is flexible and efficient.
6   -It's similar to the stuff before, but better because it makes things like Euclid's
7   -method much cleaner.
8   -<p>
9   -I think we're ready to hack.
10   -<p>
11   -<p>
12   -<hr>
13   -<h3>Files</h3>
14   -<UL>
15   -<LI> <a href=GNUmakefile><b>GNUmakefile</b></a>: Makefile
16   -<LI> <a href=README><b>README</b></a>: Empty readme
17   -<LI> <a href=explanation.html><b>explanation.html</b></a>: This file.
18   -<LI> <a href=gf.c><b>gf.c</b></a>: Main gf routines
19   -<LI> <a href=gf.h><b>gf.h</b></a>: Main gf prototypes and typedefs
20   -<LI> <a href=gf_int.h><b>gf_int.h</b></a>: Prototypes and typedefs for common routines for the
21   - internal gf implementations.
22   -<LI> <a href=gf_method.c><b>gf_method.c</b></a>: Code to help parse argc/argv to define the method.
23   - This way, various programs can be consistent with how they handle the command line.
24   -<LI> <a href=gf_method.h><b>gf_method.h</b></a>: Prototypes for ibid.
25   -<LI> <a href=gf_methods.c><b>gf_methods.c</b></a>: This program prints out how to define
26   - the various methods on the command line. My idea is to beef this up so that you can
27   - give it a method spec on the command line, and it will tell you whether it's valid, or
28   - why it's invalid. I haven't written that part yet.
29   -<LI> <a href=gf_mult.c><b>gf_mult.c</b></a>: Program to do single multiplication.
30   -<LI> <a href=gf_div.c><b>gf_div.c</b></a>: Program to do single divisions -- it's created
31   - in the makefile with a sed script on gf_mult.c.
32   -<LI> <a href=gf_time.c><b>gf_time.c</b></a>: Time tester
33   -<LI> <a href=gf_unit.c><b>gf_unit.c</b></a>: Unit tester
34   -<LI> <a href=gf_54.c><b>gf_54.c</b></a>: A simple example program that multiplies
35   - 5 and 4 in GF(2^4).
36   -<LI> <a href=gf_w4.c><b>gf_w4.c</b></a>: Implementation of code for <i>w</i> = 4.
37   -(For now, only SHIFT and LOG, plus EUCLID & MATRIX).
38   -<LI> <a href=gf_w8.c><b>gf_w8.c</b></a>: Implementation of code for <i>w</i> = 8.
39   -(For now, only SHIFT plus EUCLID & MATRIX).
40   -<LI> <a href=gf_w16.c><b>gf_w16.c</b></a>: Implementation of code for <i>w</i> = 16.
41   -(For now, only SHIFT plus EUCLID & MATRIX).
42   -<LI> <a href=gf_w32.c><b>gf_w32.c</b></a>: Implementation of code for <i>w</i> = 32.
43   -(For now, only SHIFT plus EUCLID & MATRIX).
44   -<LI> <a href=gf_w64.c><b>gf_w64.c</b></a>: Implementation of code for <i>w</i> = 64.
45   -(For now, only SHIFT and EUCLID).
46   -<LI> I don't have gf_w128.c or gf_gen.c yet.
47   -</UL>
48   -
49   -<hr>
50   -<h3>Prototypes and typedefs in gf.h</h3>
51   -
52   -The main structure that users will see is in <b>gf.h</b>, and it is of type
53   -<b>gf_t</b>:
54   -
55   -<p><center><table border=3 cellpadding=3><td><pre>
56   -typedef struct gf {
57   - gf_func_a_b multiply;
58   - gf_func_a_b divide;
59   - gf_func_a inverse;
60   - gf_region multiply_region;
61   - void *scratch;
62   -} gf_t;
63   -</pre></td></table></center><p>
64   -
65   -We can beef it up later with buf-buf or buf-acc. The problem is that the paper is
66   -already bloated, so right now, I want to keep it lean.
67   -<p>
68   -The types of the procedures are big unions, so that they work with the following
69   -types of arguments:
70   -
71   -<p><center><table border=3 cellpadding=3><td><pre>
72   -typedef uint8_t gf_val_4_t;
73   -typedef uint8_t gf_val_8_t;
74   -typedef uint16_t gf_val_16_t;
75   -typedef uint32_t gf_val_32_t;
76   -typedef uint64_t gf_val_64_t;
77   -typedef uint64_t *gf_val_128_t;
78   -typedef uint32_t gf_val_gen_t; /* The intent here is for general values <= 32 */
79   -</pre></td></table></center><p>
80   -
81   -To use one of these, you need to create one with <b>gf_init_easy()</b> or
82   -<b>gf_init_hard()</b>. Let's concentrate on the former:
83   -
84   -<p><center><table border=3 cellpadding=3><td><pre>
85   -extern int gf_init_easy(gf_t *gf, int w, int mult_type);
86   -</pre></td></table></center><p>
87   -
88   -You pass it memory for a <b>gf_t</b>, a value of <b>w</b> and
89   -a variable that says how to do multiplication. The valid values of <b>mult_type</b>
90   -are enumerated in <b>gf.h</b>:
91   -
92   -<p><center><table border=3 cellpadding=3><td><pre>
93   -typedef enum {GF_MULT_DEFAULT,
94   - GF_MULT_SHIFT,
95   - GF_MULT_GROUP,
96   - GF_MULT_BYTWO_p,
97   - GF_MULT_BYTWO_b,
98   - GF_MULT_TABLE,
99   - GF_MULT_LOG_TABLE,
100   - GF_MULT_SPLIT_TABLE,
101   - GF_MULT_COMPOSITE } gf_mult_type_t;
102   -</pre></td></table></center><p>
103   -
104   -After creating the <b>gf_t</b>, you use its <b>multiply</b> method
105   -to multiply, using the union's fields to work with the various types.
106   -It looks easier than my explanation. For example, suppose you wanted to multiply 5 and 4 in <i>GF(2<sup>4</sup>)</i>.
107   -You can do it as in
108   -<b><a href=gf_54.c>gf_54.c</a></b>
109   -
110   -<p><center><table border=3 cellpadding=3><td><pre>
111   -#include "gf.h"
112   -
113   -main()
114   -{
115   - gf_t gf;
116   -
117   - gf_init_easy(&gf, 4, GF_MULT_DEFAULT);
118   - printf("%d\n", gf.multiply.w4(&gf, 5, 4));
119   - exit(0);
120   -}
121   -</pre></td></table></center><p>
122   -
123   -
124   -If you wanted to multiply in <i>GF(2<sup>8</sup>)</i>, then you'd have to use 8 as a parameter
125   -to <b>gf_init_easy</b>, and call the multiplier as <b>gf.multiply.w8()</b>.
126   -<p>
127   -When you're done with your <b>gf_t</b>, you should call <b>gf_free()</b> on it so
128   -that it can free memory that it has allocated. We'll talk more about memory later, but if you
129   -create your <b>gf_t</b> with <b>gf_init_easy</b>, then it calls <b>malloc()</b>, and
130   -if you care about freeing memory, you'll have to call <b>gf_free()</b>.
131   -<p>
132   -
133   -<hr>
134   -<h3>Memory allocation</h3>
135   -
136   -Each implementation of a multiplication technique keeps around its
137   -own data. For example, <b>GF_MULT_TABLE</b> keeps around
138   -multiplication and division tables, and <b>GF_MULT_LOG_TABLE</b> maintains log and
139   -antilog tables. This data is stored in the pointer <b>scratch</b>. My intent
140   -is that the memory that is there is all that's required. In other
141   -words, the <b>multiply()</b>, <b>divide()</b>, <b>inverse()</b> and
142   -<b>multiply_region()</b> calls don't do any memory allocation.
143   -Moreover, <b>gf_init_easy()</b> only allocates one chunk of memory --
144   -the one in <b>scratch</b>.
145   -<p>
146   -If you don't want to have the initialization call allocate memory, you can use <b>gf_init_hard()</b>:
147   -
148   -<p><center><table border=3 cellpadding=3><td><pre>
149   -extern int gf_init_hard(gf_t *gf,
150   - int w,
151   - int mult_type,
152   - int region_type,
153   - int divide_type,
154   - uint64_t prim_poly,
155   - int arg1,
156   - int arg2,
157   - gf_t *base_gf,
158   - void *scratch_memory);
159   -</pre></td></table></center><p>
160   -
161   -The first three parameters are the same as <b>gf_init_easy()</b>.
162   -You can add additional arguments for performing <b>multiply_region</b>, and
163   -for performing division in the <b>region_type</b> and <b>divide_type</b>
164   -arguments. Their values are also defined in <b>gf.h</b>. You can
165   -mix the <b>region_type</b> values (e.g. "DOUBLE" and "SSE"):
166   -
167   -<p><center><table border=3 cellpadding=3><td><pre>
168   -#define GF_REGION_DEFAULT (0x0)
169   -#define GF_REGION_SINGLE_TABLE (0x1)
170   -#define GF_REGION_DOUBLE_TABLE (0x2)
171   -#define GF_REGION_QUAD_TABLE (0x4)
172   -#define GF_REGION_LAZY (0x8)
173   -#define GF_REGION_SSE (0x10)
174   -#define GF_REGION_NOSSE (0x20)
175   -#define GF_REGION_STDMAP (0x40)
176   -#define GF_REGION_ALTMAP (0x80)
177   -#define GF_REGION_CAUCHY (0x100)
178   -
179   -typedef uint32_t gf_region_type_t;
180   -
181   -typedef enum { GF_DIVIDE_DEFAULT,
182   - GF_DIVIDE_MATRIX,
183   - GF_DIVIDE_EUCLID } gf_division_type_t;
184   -</pre></td></table></center><p>
185   -You can change
186   -the primitive polynomial with <b>prim_poly</b>, give additional arguments with
187   -<b>arg1</b> and <b>arg2</b> and give a base Galois Field for composite fields.
188   -Finally, you can pass it a pointer to memory in <b>scratch_memory</b>. That
189   -way, you can avoid having <b>gf_init_hard()</b> call <b>malloc()</b>.
190   -<p>
191   -There is a procedure called <b>gf_scratch_size()</b> that lets you know the minimum
192   -size for <b>scratch_memory</b>, depending on <i>w</i>, the multiplication type
193   -and the arguments:
194   -
195   -<p><center><table border=3 cellpadding=3><td><pre>
196   -extern int gf_scratch_size(int w,
197   - int mult_type,
198   - int region_type,
199   - int divide_type,
200   - int arg1,
201   - int arg2);
202   -</pre></td></table></center><p>
203   -
204   -You can specify default arguments in <b>gf_init_hard()</b>:
205   -<UL>
206   -<LI> <b>region_type</b> = <b>GF_REGION_DEFAULT</b>
207   -<LI> <b>divide_type</b> = <b>GF_DIVIDE_DEFAULT</b>
208   -<LI> <b>prim_poly</b> = 0
209   -<LI> <b>arg1</b> = 0
210   -<LI> <b>arg2</b> = 0
211   -<LI> <b>base_gf</b> = <b>NULL</b>
212   -<LI> <b>scratch_memory</b> = <b>NULL</b>
213   -</UL>
214   -If any argument is equal to its default, then default actions are taken (e.g. a
215   -standard primitive polynomial is used, or memory is allocated for <b>scratch_memory</b>).
216   -In fact, <b>gf_init_easy()</b> simply calls <b>gf_init_hard()</b> with the default
217   -parameters.
218   -<p>
219   -<b>gf_free()</b> frees memory that was allocated with <b>gf_init_easy()</b>
220   -or <b>gf_init_hard()</b>. The <b>recursive</b> parameter is in case you
221   -use composite fields, and want to recursively free the base fields.
222   -If you pass <b>scratch_memory</b> to <b>gf_init_hard()</b>, then you typically
223   -don't need to call <b>gf_free()</b>. It won't hurt to call it, though.
224   -
225   -<hr>
226   -<h3>gf_mult and gf_div</h3>
227   -
228   -For the moment, I have few things completely implemented, but that's because I want
229   -to be able to explain the structure, and how to specify methods. In particular, for
230   -<i>w=4</i>, I have implemented <b>SHIFT</b> and <b>LOG</b>. For <i>w=8, 16, 32, 64</i>
231   -I have implemented <b>SHIFT</b>. For all <i>w &le; 32</i>, I have implemented both
232   -Euclid's algorithm for inversion, and the matrix method for inversion. For
233   -<i>w=64</i>, it's just Euclid. You can
234   -test these all with <b>gf_mult</b> and <b>gf_div</b>. Here are a few calls:
235   -
236   -<pre>
237   -UNIX> <font color=darkred><b>gf_mult 7 11 4</b></font> - Default
238   -4
239   -UNIX> <font color=darkred><b>gf_mult 7 11 4 SHIFT - -</b></font> - Use shift
240   -4
241   -UNIX> <font color=darkred><b>gf_mult 7 11 4 LOG - -</b></font> - Use logs
242   -4
243   -UNIX> <font color=darkred><b>gf_div 4 7 4</b></font> - Default
244   -11
245   -UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - -</b></font> - Use logs
246   -11
247   -UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - EUCLID</b></font> - Use Euclid instead of logs
248   -11
249   -UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - MATRIX</b></font> - Use Matrix inversion instead of logs
250   -11
251   -UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - -</b></font> - Default
252   -11
253   -UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - EUCLID</b></font> - Use Euclid (which is the default)
254   -11
255   -UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - MATRIX</b></font> - Use Matrix inversion instead of logs
256   -11
257   -UNIX> <font color=darkred><b>gf_mult 200 211 8</b></font> - The remainder are shift/Euclid
258   -201
259   -UNIX> <font color=darkred><b>gf_div 201 211 8</b></font>
260   -200
261   -UNIX> <font color=darkred><b>gf_mult 60000 65111 16</b></font>
262   -63515
263   -UNIX> <font color=darkred><b>gf_div 63515 65111 16</b></font>
264   -60000
265   -UNIX> <font color=darkred><b>gf_mult abcd0001 9afbf788 32h</b></font>
266   -b0359681
267   -UNIX> <font color=darkred><b>gf_div b0359681 9afbf788 32h</b></font>
268   -abcd0001
269   -UNIX> <font color=darkred><b>gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h</b></font>
270   -3a7def35185bd571
271   -UNIX> <font color=darkred><b>gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h</b></font>
272   -3a7def35185bd571
273   -UNIX> <font color=darkred><b>gf_div 3a7def35185bd571 9afbf7887f6d8e5b 64h</b></font>
274   -abcd00018c8b8c8a
275   -UNIX> <font color=darkred><b></b></font>
276   -</pre>
277   -
278   -You can see all the methods with <b>gf_methods</b>. We have a lot of implementing to do:
279   -
280   -<pre>
281   -UNIX> <font color=darkred><b>gf_methods</b></font>
282   -To specify the methods, do one of the following:
283   - - leave empty to use defaults
284   - - use a single dash to use defaults
285   - - specify MULTIPLY REGION DIVIDE
286   -
287   -Legal values of MULTIPLY:
288   - SHIFT: shift
289   - GROUP g_mult g_reduce: the Group technique - see the paper
290   - BYTWO_p: BYTWO doubling the product.
291   - BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)
292   - TABLE: Full multiplication table
293   - LOG: Discrete logs
294   - LOG_ZERO: Discrete logs with a large table for zeros
295   - SPLIT g_a g_b: Split tables defined by g_a and g_b
296   - COMPOSITE k l [METHOD]: Composite field, recursively specify the
297   - method of the base field in GF(2^l)
298   -
299   -Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'
300   - -: Use defaults
301   - SINGLE/DOUBLE/QUAD: Expand tables
302   - LAZY: Lazily create table (only applies to TABLE and SPLIT)
303   - SSE/NOSSE: Use 128-bit SSE instructions if you can
304   - CAUCHY/ALTMAP/STDMAP: Use different memory mappings
305   -
306   -Legal values of DIVIDE:
307   - -: Use defaults
308   - MATRIX: Use matrix inversion
309   - EUCLID: Use the extended Euclidian algorithm.
310   -
311   -See the user's manual for more information.
312   -There are many restrictions, so it is better to simply use defaults in most cases.
313   -UNIX> <font color=darkred><b></b></font>
314   -</pre>
315   -
316   -<hr>
317   -<h3>gf_unit and gf_time</h3>
318   -
319   -<b><a href=gf_unit.c>gf_unit.c</a></b> is a unit tester, and
320   -<b><a href=gf_time.c>gf_time.c</a></b> is a time tester.
321   -
322   -They are called as follows:
323   -
324   -<p><center><table border=3 cellpadding=3><td><pre>
325   -UNIX> <font color=darkred><b>gf_unit w tests seed [METHOD] </b></font>
326   -UNIX> <font color=darkred><b>gf_time w tests seed size(bytes) iterations [METHOD] </b></font>
327   -</pre></td></table></center><p>
328   -
329   -The <b>tests</b> parameter is one or more of the following characters:
330   -
331   -<UL>
332   -<LI> A: Do all tests
333   -<LI> S: Test only single operations (multiplication/division)
334   -<LI> R: Test only region operations
335   -<LI> V: Verbose Output
336   -</UL>
337   -
338   -<b>seed</b> is a seed for <b>srand48()</b> -- using -1 defaults to the current time.
339   -<p>
340   -For example, testing the defaults with w=4:
341   -
342   -<pre>
343   -UNIX> <font color=darkred><b>gf_unit 4 AV 1 LOG - -</b></font>
344   -Seed: 1
345   -Testing single multiplications/divisions.
346   -Testing Inversions.
347   -Testing buffer-constant, src != dest, xor = 0
348   -Testing buffer-constant, src != dest, xor = 1
349   -Testing buffer-constant, src == dest, xor = 0
350   -Testing buffer-constant, src == dest, xor = 1
351   -UNIX> <font color=darkred><b>gf_unit 4 AV 1 SHIFT - -</b></font>
352   -Seed: 1
353   -Testing single multiplications/divisions.
354   -Testing Inversions.
355   -No multiply_region.
356   -UNIX> <font color=darkred><b></b></font>
357   -</pre>
358   -
359   -There is no <b>multiply_region()</b> method defined for <b>SHIFT</b>.
360   -Thus, the procedures are <b>NULL</b> and the unit tester ignores them.
361   -<p>
362   -At the moment, I only have the unit tester working for w=4.
363   -<p>
364   -<b>gf_time</b> takes the size of an array (in bytes) and a number of iterations, and
365   -tests the speed of both single and region operations. The tests are:
366   -
367   -<UL>
368   -<LI> A: All
369   -<LI> S: All Single Operations
370   -<LI> R: All Region Operations
371   -<LI> M: Single: Multiplications
372   -<LI> D: Single: Divisions
373   -<LI> I: Single: Inverses
374   -<LI> B: Region: Multiply_Region
375   -</UL>
376   -
377   -Here are some examples with <b>SHIFT</b> and <b>LOG</b> on my mac.
378   -
379   -<pre>
380   -UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - -</b></font>
381   -Seed: 1
382   -Multiply: 0.538126 s 185.830 Mega-ops/s
383   -Divide: 0.520825 s 192.003 Mega-ops/s
384   -Inverse: 0.631198 s 158.429 Mega-ops/s
385   -Buffer-Const,s!=d,xor=0: 0.478395 s 209.032 MB/s
386   -Buffer-Const,s!=d,xor=1: 0.524245 s 190.751 MB/s
387   -Buffer-Const,s==d,xor=0: 0.471851 s 211.931 MB/s
388   -Buffer-Const,s==d,xor=1: 0.528275 s 189.295 MB/s
389   -UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - EUCLID</b></font>
390   -Seed: 1
391   -Multiply: 0.555512 s 180.014 Mega-ops/s
392   -Divide: 5.359434 s 18.659 Mega-ops/s
393   -Inverse: 4.911719 s 20.359 Mega-ops/s
394   -Buffer-Const,s!=d,xor=0: 0.496097 s 201.573 MB/s
395   -Buffer-Const,s!=d,xor=1: 0.538536 s 185.689 MB/s
396   -Buffer-Const,s==d,xor=0: 0.485564 s 205.946 MB/s
397   -Buffer-Const,s==d,xor=1: 0.540227 s 185.107 MB/s
398   -UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - MATRIX</b></font>
399   -Seed: 1
400   -Multiply: 0.544005 s 183.822 Mega-ops/s
401   -Divide: 7.602822 s 13.153 Mega-ops/s
402   -Inverse: 7.000564 s 14.285 Mega-ops/s
403   -Buffer-Const,s!=d,xor=0: 0.474868 s 210.585 MB/s
404   -Buffer-Const,s!=d,xor=1: 0.527588 s 189.542 MB/s
405   -Buffer-Const,s==d,xor=0: 0.473130 s 211.358 MB/s
406   -Buffer-Const,s==d,xor=1: 0.529877 s 188.723 MB/s
407   -UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 SHIFT - -</b></font>
408   -Seed: 1
409   -Multiply: 2.708842 s 36.916 Mega-ops/s
410   -Divide: 8.756882 s 11.420 Mega-ops/s
411   -Inverse: 5.695511 s 17.558 Mega-ops/s
412   -UNIX> <font color=darkred><b></b></font>
413   -</pre>
414   -
415   -At the moment, I only have the timer working for w=4.
416   -
417   -<hr>
418   -<h3>Walking you through <b>LOG</b></h3>
419   -
420   -To see how <b>scratch</b> is used to store data, let's look at what happens when
421   -you call <b>gf_init_easy(&gf, 4, GF_MULT_LOG_TABLE);</b>
422   -First, <b>gf_init_easy()</b> calls <b>gf_init_hard()</b> with default parameters.
423   -This is in <b><a href=gf.c>gf.c</a></b>.
424   -<p>
425   -<b>gf_init_hard()</b>'s first job is to set up the scratch.
426   -The scratch's type is <b>gf_internal_t</b>, defined in
427   -<b><a href=gf_int.h>gf_int.h</a></b>:
428   -
429   -<p><center><table border=3 cellpadding=3><td><pre>
430   -typedef struct {
431   - int mult_type;
432   - int region_type;
433   - int divide_type;
434   - int w;
435   - uint64_t prim_poly;
436   - int free_me;
437   - int arg1;
438   - int arg2;
439   - gf_t *base_gf;
440   - void *private;
441   -} gf_internal_t;
442   -</pre></td></table></center><p>
443   -
444   -All the fields are straightforward, with the exception of <b>private</b>. That is
445   -a <b>(void *)</b> which points to the implementation's private data.
446   -<p>
447   -Here's the code for
448   -<b>gf_init_hard()</b>:
449   -
450   -<p><center><table border=3 cellpadding=3><td><pre>
451   -int gf_init_hard(gf_t *gf, int w, int mult_type,
452   - int region_type,
453   - int divide_type,
454   - uint64_t prim_poly,
455   - int arg1, int arg2,
456   - gf_t *base_gf,
457   - void *scratch_memory)
458   -{
459   - int sz;
460   - gf_internal_t *h;
461   -
462   -
463   - if (scratch_memory == NULL) {
464   - sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
465   - if (sz &lt;= 0) return 0;
466   - h = (gf_internal_t *) malloc(sz);
467   - h-&gt;free_me = 1;
468   - } else {
469   - h = scratch_memory;
470   - h-&gt;free_me = 0;
471   - }
472   - gf-&gt;scratch = (void *) h;
473   - h-&gt;mult_type = mult_type;
474   - h-&gt;region_type = region_type;
475   - h-&gt;divide_type = divide_type;
476   - h-&gt;w = w;
477   - h-&gt;prim_poly = prim_poly;
478   - h-&gt;arg1 = arg1;
479   - h-&gt;arg2 = arg2;
480   - h-&gt;base_gf = base_gf;
481   - h-&gt;private = (void *) gf-&gt;scratch;
482   - h-&gt;private += (sizeof(gf_internal_t));
483   -
484   - switch(w) {
485   - case 4: return gf_w4_init(gf);
486   - case 8: return gf_w8_init(gf);
487   - case 16: return gf_w16_init(gf);
488   - case 32: return gf_w32_init(gf);
489   - case 64: return gf_w64_init(gf);
490   - case 128: return gf_dummy_init(gf);
491   - default: return 0;
492   - }
493   -}
494   -</pre></td></table></center><p>
495   -
496   -The first thing it does is determine if it has to allocate space for <b>scratch</b>.
497   -If it must, it uses <b>gf_scratch_size()</b> to figure out how big the space must be.
498   -It then sets <b>gf->scratch</b> to this space, and sets all of the fields of the
499   -scratch to the arguments in <b>gf_init_hard()</b>. The <b>private</b> pointer is
500   -set to be the space just after the <b>gf_internal_t</b> header at the start of <b>gf->scratch</b>. Again, it is up to
501   -<b>gf_scratch_size()</b> to make sure there is enough space for the scratch, and
502   -for all of the private data needed by the implementation.
503   -<p>
504   -Once the scratch is set up, <b>gf_init_hard()</b> calls <b>gf_w4_init()</b>. This is
505   -in <b><a href=gf_w4.c>gf_w4.c</a></b>, and it is a
506   -simple dispatcher to the various initialization routines, plus it
507   -sets <b>EUCLID</b> and <b>MATRIX</b> if need be:
508   -
509   -<p><center><table border=3 cellpadding=3><td><pre>
510   -int gf_w4_init(gf_t *gf)
511   -{
512   - gf_internal_t *h;
513   -
514   - h = (gf_internal_t *) gf-&gt;scratch;
515   - if (h-&gt;prim_poly == 0) h-&gt;prim_poly = 0x13;
516   -
517   - gf-&gt;multiply.w4 = NULL;
518   - gf-&gt;divide.w4 = NULL;
519   - gf-&gt;inverse.w4 = NULL;
520   - gf-&gt;multiply_region.w4 = NULL;
521   -
522   - switch(h-&gt;mult_type) {
523   - case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break;
524   - case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
525   - case GF_MULT_DEFAULT: if (gf_w4_log_init(gf) == 0) return 0; break;
526   - default: return 0;
527   - }
528   - if (h-&gt;divide_type == GF_DIVIDE_EUCLID) {
529   - gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
530   - gf-&gt;inverse.w4 = gf_w4_euclid;
531   - } else if (h-&gt;divide_type == GF_DIVIDE_MATRIX) {
532   - gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
533   - gf-&gt;inverse.w4 = gf_w4_matrix;
534   - }
535   -
536   - if (gf-&gt;inverse.w4 != NULL && gf-&gt;divide.w4 == NULL) {
537   - gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
538   - }
539   - if (gf-&gt;inverse.w4 == NULL && gf-&gt;divide.w4 != NULL) {
540   - gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
541   - }
542   - return 1;
543   -}
544   -</pre></td></table></center><p>
545   -
546   -The code in <b>gf_w4_log_init()</b> sets up the log and antilog tables, and sets
547   -the <b>multiply.w4</b>, <b>divide.w4</b> etc routines to be the ones for logs. The
548   -tables are put into <b>gf->scratch->private</b>, which is typecast to a <b>struct
549   -gf_logtable_data *</b>:
550   -
551   -<p><center><table border=3 cellpadding=3><td><pre>
552   -struct gf_logtable_data {
553   - gf_val_4_t log_tbl[GF_FIELD_SIZE];
554   - gf_val_4_t antilog_tbl[GF_FIELD_SIZE * 2];
555   - gf_val_4_t *antilog_tbl_div;
556   -};
557   -.......
558   -
559   -static
560   -int gf_w4_log_init(gf_t *gf)
561   -{
562   - gf_internal_t *h;
563   - struct gf_logtable_data *ltd;
564   - int i, b;
565   -
566   - h = (gf_internal_t *) gf-&gt;scratch;
567   - ltd = h-&gt;private;
568   -
569   - ltd-&gt;log_tbl[0] = 0;
570   -
571   - ltd-&gt;antilog_tbl_div = ltd-&gt;antilog_tbl + (GF_FIELD_SIZE-1);
572   - b = 1;
573   - for (i = 0; i &lt; GF_FIELD_SIZE-1; i++) {
574   - ltd-&gt;log_tbl[b] = (gf_val_8_t)i;
575   - ltd-&gt;antilog_tbl[i] = (gf_val_8_t)b;
576   - ltd-&gt;antilog_tbl[i+GF_FIELD_SIZE-1] = (gf_val_8_t)b;
577   - b &lt;&lt;= 1;
578   - if (b & GF_FIELD_SIZE) {
579   - b = b ^ h-&gt;prim_poly;
580   - }
581   - }
582   -
583   - gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
584   - gf-&gt;divide.w4 = gf_w4_log_divide;
585   - gf-&gt;multiply.w4 = gf_w4_log_multiply;
586   - gf-&gt;multiply_region.w4 = gf_w4_log_multiply_region;
587   - return 1;
588   -}
589   -</pre></td></table></center><p>
590   -
591   -And of course the individual routines use <b>h->private</b> to access the tables:
592   -
593   -<p><center><table border=3 cellpadding=3><td><pre>
594   -static
595   -inline
596   -gf_val_8_t gf_w4_log_multiply (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
597   -{
598   - struct gf_logtable_data *ltd;
599   -
600   - ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf-&gt;scratch))-&gt;private;
601   - return (a == 0 || b == 0) ? 0 : ltd-&gt;antilog_tbl[(unsigned)(ltd-&gt;log_tbl[a] + ltd-&gt;log_tbl[b])];
602   -}
603   -</pre></td></table></center><p>
604   -
605   -Finally, it's important that the proper sizes are put into
606   -<b>gf_w4_scratch_size()</b> for each implementation:
607   -
608   -<p><center><table border=3 cellpadding=3><td><pre>
609   -int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
610   -{
611   - int region_tbl_size;
612   - switch(mult_type)
613   - {
614   - case GF_MULT_DEFAULT:
615   - case GF_MULT_LOG_TABLE:
616   - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
617   - break;
618   - case GF_MULT_SHIFT:
619   - return sizeof(gf_internal_t);
620   - break;
621   - default:
622   - return -1;
623   - }
624   -}
625   -</pre></td></table></center><p>
626   -I hope that's enough explanation for y'all to start implementing. Let me know if you have
627   -problems -- thanks -- Jim
628   -
629   -<hr>
630   -The initial structure has been set for w=4, 8, 16, 32 and 64, with implementations of SHIFT and EUCLID, and for w <= 32, MATRIX. There are some weird caveats:
631   -
632   -<UL>
633   -<LI> For w=32 and w=64, the primitive polynomial does not have the leading one.
634   -<LI> I'd like for naming to be:
635   -<p>
636   -<UL>
637   - <b>gf_w</b><i>w</i><b>_</b><i>technique</i><b>_</b><i>functionality</i><b>()</b>.
638   -</UL>
639   -<p>
640   -For example, the log techniques for w=4 are:
641   -<pre>
642   -gf_w4_log_multiply()
643   -gf_w4_log_divide()
644   -gf_w4_log_multiply_region()
645   -gf_w4_log_init()
646   -</pre>
647   -<p>
648   -<LI> I'd also like a header block on implementations that says who wrote it.
649   -</UL>
650   -
651   -<hr>
652   -<h3>Things we need to Implement: <i>w=4</i></h3>
653   -
654   -<p><table border=3 cellpadding=2>
655   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
656   -<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
657   -<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
658   -<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
659   -<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
660   -<tr> <td> Single TABLE </td> <td> Done - Jim </td> </tr>
661   -<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
662   -<tr> <td> Double TABLE, SSE </td> <td> Done - Jim </td> </tr>
663   -<tr> <td> Quad TABLE </td> <td>Done - Jim</td> </tr>
664   -<tr> <td> Lazy Quad TABLE </td> <td>Done - Jim</td> </tr>
665   -<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
666   -</table><p>
667   -
668   -<hr>
669   -<h3>Things we need to Implement: <i>w=8</i></h3>
670   -
671   -<p><table border=3 cellpadding=2>
672   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
673   -<tr> <td> BYTWO_p </td> <td>Done - Jim </td> </tr>
674   -<tr> <td> BYTWO_b </td> <td>Done - Jim </td> </tr>
675   -<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim </td> </tr>
676   -<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim </td> </tr>
677   -<tr> <td> Single TABLE </td> <td> Done - Kevin </td> </tr>
678   -<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
679   -<tr> <td> Lazy Double TABLE </td> <td> Done - Jim </td> </tr>
680   -<tr> <td> Split 2 1 (Half) SSE </td> <td>Done - Jim</td> </tr>
681   -<tr> <td> Composite, k=2 </td> <td> Done - Kevin (alt mapping not passing unit test) </td> </tr>
682   -<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
683   -<tr> <td> LOG ZERO</td> <td> Done - Jim</td> </tr>
684   -</table><p>
685   -
686   -<hr>
687   -<h3>Things we need to Implement: <i>w=16</i></h3>
688   -
689   -<p><table border=3 cellpadding=2>
690   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
691   -<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
692   -<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
693   -<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
694   -<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
695   -<tr> <td> Lazy TABLE </td> <td>Done - Jim</td> </tr>
696   -<tr> <td> Split 4 16 No-SSE, lazy </td> <td>Done - Jim</td> </tr>
697   -<tr> <td> Split 4 16 SSE, lazy </td> <td>Done - Jim</td> </tr>
698   -<tr> <td> Split 4 16 SSE, lazy, alternate mapping </td> <td>Done - Jim</td> </tr>
699   -<tr> <td> Split 8 16, lazy </td> <td>Done - Jim</td> </tr>
700   -<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
701   -<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
702   -<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
703   -<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
704   -<tr> <td> LOG ZERO</td> <td> Done - Kevin </td> </tr>
705   -<tr> <td> Group 4 4 </td> <td>Done - Jim: I don't see a reason to implement others, although 4-8 will be faster, and 8 8 will have faster region ops. They'll never beat SPLIT.</td> </tr>
706   -</table><p>
707   -
708   -<hr>
709   -<h3>Things we need to Implement: <i>w=32</i></h3>
710   -
711   -<p><table border=3 cellpadding=2>
712   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
713   -<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
714   -<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
715   -<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
716   -<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
717   -<tr> <td> Split 2 32,lazy </td> <td>Done - Jim</td> </tr>
718   -<tr> <td> Split 2 32, SSE, lazy </td> <td>Done - Jim</td> </tr>
719   -<tr> <td> Split 4 32, lazy </td> <td>Done - Jim</td> </tr>
720   -<tr> <td> Split 4 32, SSE,ALTMAP lazy </td> <td>Done - Jim</td> </tr>
721   -<tr> <td> Split 4 32, SSE, lazy </td> <td>Done - Jim</td> </tr>
722   -<tr> <td> Split 8 8 </td> <td>Done - Jim </td> </tr>
723   -<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
724   -<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
725   -<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
726   -<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
727   -<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
728   -</table><p>
729   -<hr>
730   -<h3>Things we need to Implement: <i>w=64</i></h3>
731   -
732   -<p><table border=3 cellpadding=2>
733   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
734   -<tr> <td> BYTWO_p </td> <td> - </td> </tr>
735   -<tr> <td> BYTWO_b </td> <td> - </td> </tr>
736   -<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
737   -<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
738   -<tr> <td> Split 16 1 SSE, maybe lazy </td> <td> - </td> </tr>
739   -<tr> <td> Split 8 1 lazy </td> <td> - </td> </tr>
740   -<tr> <td> Split 8 8 </td> <td> - </td> </tr>
741   -<tr> <td> Split 8 8 lazy </td> <td> - </td> </tr>
742   -<tr> <td> Group </td> <td> - </td> </tr>
743   -<tr> <td> Composite, k=2, alternate mapping </td> <td> - </td> </tr>
744   -</table><p>
745   -<hr>
746   -<h3>Things we need to Implement: <i>w=128</i></h3>
747   -
748   -<p><table border=3 cellpadding=2>
749   -<tr> <td> SHIFT </td> <td> Done - Will </td> </tr>
750   -<tr> <td> BYTWO_p </td> <td> - </td> </tr>
751   -<tr> <td> BYTWO_b </td> <td> - </td> </tr>
752   -<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
753   -<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
754   -<tr> <td> Split 32 1 SSE, maybe lazy </td> <td> - </td> </tr>
755   -<tr> <td> Split 16 1 lazy </td> <td> - </td> </tr>
756   -<tr> <td> Split 16 16 - Maybe that's insanity</td> <td> - </td> </tr>
757   -<tr> <td> Split 16 16 lazy </td> <td> - </td> </tr>
758   -<tr> <td> Group (SSE) </td> <td> - </td> </tr>
759   -<tr> <td> Composite, k=?, alternate mapping </td> <td> - </td> </tr>
760   -</table><p>
761   -<hr>
762   -<h3>Things we need to Implement: <i>w=general between 1 & 32</i></h3>
763   -
764   -<p><table border=3 cellpadding=2>
765   -<tr> <td> CAUCHY Region (SSE XOR)</td> <td> Done - Jim </td> </tr>
766   -<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
767   -<tr> <td> TABLE </td> <td> Done - Jim </td> </tr>
768   -<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
769   -<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
770   -<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
771   -<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
772   -<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
773   -<tr> <td> Split - do we need it?</td> <td>Done - Jim</td></tr>
774   -<tr> <td> Composite - do we need it?</td> <td> - </td></tr>
775   -<tr> <td> Split - do we need it?</td> <td> - </td></tr>
776   -<tr> <td> Logzero?</td> <td> - </td></tr>
777   -</table><p>
flag_tester/README.txt 0 → 100644
... ... @@ -0,0 +1,10 @@
  1 +Run which_compile_flags.sh and it will print out the compile flags to use in
  2 + GNUmakefile. By default, this script uses "cc" as its compiler but you can
  3 + pass in the name of your compiler as an argument.
  4 +
  5 +EXAMPLE: "./which_compile_flags.sh clang"
  6 +
  7 +This script will run "clang" in the above example so be warned that if you type
  8 +something like "rm" for that argument, you get what you asked for. Also, make
  9 +sure that the compiler that you pass to which_compile_flags.sh is the same as
  10 +the compiler in GNUmakefile.
... ...
flag_tester/flag_test.c 0 → 100644
... ... @@ -0,0 +1,120 @@
  1 +/*
  2 + * flag_test.c - copied from whats_my_sse.c to output proper compile
  3 + * flags for the GNUmakefile
  4 + *
  5 + */
  6 +
  7 +#include <stdio.h>
  8 +#include <stdlib.h>
  9 +#include <string.h>
  10 +#include "intel_cpu_capabilities.h"
  11 +
/*
 * usage - write the expected command line to stderr and terminate the
 * program with EXIT_FAILURE.  This function does not return.
 */
void usage()
{
  fputs("usage: flag_test <compiler name>\n", stderr);
  exit(EXIT_FAILURE);
}
  17 +
  18 +int main(int argc, char **argv)
  19 +{
  20 + //make sure to extend these buffers if more flags are added to this program
  21 + char cflags[1000], ldflags[1000], buf[1000];
  22 + FILE *file;
  23 + char sse_found = 0;
  24 +
  25 + if(argc != 2)
  26 + usage();
  27 +
  28 + sprintf(cflags, "CFLAGS = -O3");
  29 + sprintf(ldflags, "LDFLAGS = -O3");
  30 +
  31 + if(cpu_has_feature(CPU_CAP_SSE42))
  32 + {
  33 + sprintf(buf, "%s sse_test.c -o sse4 -msse4 -DSSE4 2> /dev/null", argv[1]);
  34 + system(buf);
  35 + if(file = fopen("sse4", "r"))
  36 + {
  37 + fclose(file);
  38 +
  39 + //run program and compare to the included output
  40 + system("./sse4 > temp.txt 2> /dev/null");
  41 + system("diff sse4_test.txt temp.txt > diff.txt 2> /dev/null");
  42 + file = fopen("diff.txt", "r");
  43 + if(fgetc(file) == EOF)
  44 + {
  45 + strcat(cflags, " -msse4 -DINTEL_SSE4");
  46 + strcat(ldflags, " -msse4");
  47 + sse_found = 1;
  48 + }
  49 + fclose(file);
  50 + }
  51 + }
  52 +
  53 + if(cpu_has_feature(CPU_CAP_SSSE3) && !sse_found)
  54 + {
  55 + sprintf(buf, "%s sse_test.c -o ssse3 -mssse3 -DSSSE3 2> /dev/null", argv[1]);
  56 + system(buf);
  57 + if(file = fopen("ssse3", "r"))
  58 + {
  59 + fclose(file);
  60 +
  61 + //run program and compare to the included output
  62 + system("./ssse3 > temp.txt 2> /dev/null");
  63 + system("diff ssse3_test.txt temp.txt > diff.txt 2> /dev/null");
  64 + file = fopen("diff.txt", "r");
  65 + if(fgetc(file) == EOF)
  66 + {
  67 + strcat(cflags, " -mssse3 -DINTEL_SSSE3");
  68 + strcat(ldflags, " -mssse3");
  69 + sse_found = 1;
  70 + }
  71 + fclose(file);
  72 + }
  73 + }
  74 +
  75 + if(cpu_has_feature(CPU_CAP_SSE2) && !sse_found)
  76 + {
  77 + sprintf(buf, "%s sse_test.c -o sse2 -msse2 -DSSE2 2> /dev/null", argv[1]);
  78 + system(buf);
  79 + if(file = fopen("sse2", "r"))
  80 + {
  81 + fclose(file);
  82 +
  83 + //run program and compare to the included output
  84 + system("./sse2 > temp.txt 2> /dev/null");
  85 + system("diff sse2_test.txt temp.txt > diff.txt 2> /dev/null");
  86 + file = fopen("diff.txt", "r");
  87 + if(fgetc(file) == EOF)
  88 + {
  89 + strcat(cflags, " -msse2 -DINTEL_SSE2");
  90 + strcat(ldflags, " -msse2");
  91 + sse_found = 1;
  92 + }
  93 + fclose(file);
  94 + }
  95 + }
  96 +
  97 + if(cpu_has_feature(CPU_CAP_PCLMULQDQ) && sse_found)
  98 + {
  99 + sprintf(buf, "%s pclmul_test.c -o pclmul -maes -mpclmul 2> /dev/null"
  100 + , argv[1]);
  101 + system(buf);
  102 + if(file = fopen("pclmul", "r"))
  103 + {
  104 + fclose(file);
  105 +
  106 + //run program and compare to the included output
  107 + system("./pclmul > temp.txt 2> /dev/null");
  108 + system("diff pclmul_test.txt temp.txt > diff.txt 2> /dev/null");
  109 + file = fopen("diff.txt", "r");
  110 + if(fgetc(file) == EOF)
  111 + {
  112 + strcat(cflags, " -maes -mpclmul -DINTEL_PCLMUL");
  113 + strcat(ldflags, " -maes -mpclmul");
  114 + }
  115 + fclose(file);
  116 + }
  117 + }
  118 +
  119 + printf("%s\n%s\n", cflags, ldflags);
  120 +}
... ...
flag_tester/intel_cpu_capabilities.h 0 → 100644
... ... @@ -0,0 +1,43 @@
  1 +/*
  2 + * Routines to figure out what an Intel CPU's capabilities are.
  3 + */
  4 +
  5 +#pragma once
  6 +
  7