1 /*
2  * Hunt - A refined core library for D programming language.
3  *
4  * Copyright (C) 2018-2019 HuntLabs
5  *
6  * Website: https://www.huntlabs.net/
7  *
8  * Licensed under the Apache-2.0 License.
9  *
10  */
11 
12 module hunt.collection.ArrayTrie;
13 
14 import hunt.collection.AbstractTrie;
15 import hunt.util.Common;
16 import hunt.collection.ByteBuffer;
17 import hunt.collection.HashSet;
18 import hunt.collection.Set;
19 import hunt.collection.Trie;
20 
21 import hunt.Exceptions;
22 import hunt.text.StringBuilder;
23 
24 import hunt.logging;
25 import std.conv;
26 
27 /**
28  * <p>A Trie string lookup data structure using a fixed size array.</p>
29  * <p>This implementation is always case insensitive and is optimal for
30  * a small number of fixed strings with few special characters.  The
31  * Trie is stored in an array of lookup tables, each indexed by the
32  * next character of the key.   Frequently used characters are directly
33  * indexed in each lookup table, whilst infrequently used characters
34  * must use a big character table.
35  * </p>
36  * <p>This Trie is very space efficient if the key characters are
37  * from ' ', '+', '-', ':', ';', '.', 'A' to 'Z' or 'a' to 'z'.
38  * Other ISO-8859-1 characters can be used by the key, but less space
39  * efficiently.
40  * </p>
41  * <p>This Trie is not Threadsafe and contains no mutual exclusion
42  * or deliberate memory barriers.  It is intended for an ArrayTrie to be
43  * built by a single thread and then used concurrently by multiple threads
44  * and not mutated during that access.  If concurrent mutations of the
45  * Trie is required external locks need to be applied.
46  * </p>
47  *
48  * @param (V) the element of entry
49  */
50 class ArrayTrie(V) : AbstractTrie!(V) {
51     /**
52      * The Size of a Trie row is how many characters can be looked
53      * up directly without going to a big index.  This is set at
54      * 32 to cover case insensitive alphabet and a few other common
55      * characters.
56      */
57     private enum int ROW_SIZE = 32;
58 
59     /**
60      * The index lookup table, this maps a character as a byte
61      * (ISO-8859-1 or UTF8) to an index within a Trie row
62      */
63     private enum int[] __lookup =
64     [ // 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
65    /*0*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
66    /*1*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
67    /*2*/31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, 27, 30, -1,
68    /*3*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28, 29, -1, -1, -1, -1,
69    /*4*/-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
70    /*5*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
71    /*6*/-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
72    /*7*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
73     ];
74 
75     /**
76      * The Trie rows in a single array which allows a lookup of row,character
77      * to the next row in the Trie.  This is actually a 2 dimensional
78      * array that has been flattened to achieve locality of reference.
79      * The first ROW_SIZE entries are for row 0, then next ROW_SIZE
80      * entries are for row 1 etc.   So in general instead of using
81      * _rows[row][index], we use _rows[row*ROW_SIZE+index] to look up
82      * the next row for a given character.
83      * <p>
84      * The array is of characters rather than integers to save space.
85      */
86     private byte[] _rowIndex;
87 
88     /**
89      * The key (if any) for a Trie row.
90      * A row may be a leaf, a node or both in the Trie tree.
91      */
92     private string[] _key;
93 
94     /**
95      * The value (if any) for a Trie row.
96      * A row may be a leaf, a node or both in the Trie tree.
97      */
98     private V[] _value;
99 
100     /**
101      * A big index for each row.
102      * If a character outside of the lookup map is needed,
103      * then a big index will be created for the row, with
104      * 256 entries, one for each possible byte.
105      */
106     private byte[][] _bigIndex;
107 
108     /**
109      * The number of rows allocated
110      */
111     private byte _rows;
112 
113     this() {
114         this(128);
115     }
116 
117     /* ------------------------------------------------------------ */
118 
119     /**
120      * @param capacity The capacity of the trie, which at the worst case
121      *                 is the total number of characters of all keys stored in the Trie.
122      *                 The capacity needed is dependent of the shared prefixes of the keys.
123      *                 For example, a capacity of 6 nodes is required to store keys "foo"
124      *                 and "bar", but a capacity of only 4 is required to
125      *                 store "bar" and "bat".
126      */
127     this(int capacity) {
128         super(true);
129         _value = new V[capacity];
130         _rowIndex = new byte[capacity * 32];
131         _key = new string[capacity];
132     }
133 
134     /* ------------------------------------------------------------ */
135     override
136     void clear() {
137         _rows = 0;
138         _value[] = V.init;
139         _rowIndex[] = 0;
140         _key[] = null;
141     }
142 
143     /* ------------------------------------------------------------ */
144     override
145     bool put(string s, V v) {
146         int t = 0;
147         size_t k;
148         size_t limit = s.length;
149         for (k = 0; k < limit; k++) {
150             byte c = s[k];
151 
152             int index = __lookup[c & 0x7f];
153             if (index >= 0) {
154                 int idx = t * ROW_SIZE + index;
155                 t = _rowIndex[idx];
156                 if (t == 0) {
157                     if (++_rows >= _value.length)
158                         return false;
159                     t = _rowIndex[idx] = _rows;
160                 }
161             } else if (c > 127)
162                 throw new IllegalArgumentException("non ascii character");
163             else {
164                 if (_bigIndex is null)
165                     _bigIndex = new byte[][_value.length];
166                 if (t >= _bigIndex.length)
167                     return false;
168                 byte[] big = _bigIndex[t];
169                 if (big is null)
170                     big = _bigIndex[t] = new byte[128];
171                 t = big[c];
172                 if (t == 0) {
173                     if (_rows == _value.length)
174                         return false;
175                     t = big[c] = ++_rows;
176                 }
177             }
178         }
179 
180         if (t >= _key.length) {
181             _rows = cast(byte) _key.length;
182             return false;
183         }
184 
185         static if(is(V == class)) {
186             _key[t] = v is null ? null : s;
187         } else {
188             _key[t] = v == V.init ? null : s;
189         }
190         _value[t] = v;
191         return true;
192     }
193 
194     /* ------------------------------------------------------------ */
195     override
196     V get(string s, int offset, int len) {
197         int t = 0;
198         for (int i = 0; i < len; i++) {
199             byte c = s[offset + i];
200             int index = __lookup[c & 0x7f];
201             if (index >= 0) {
202                 int idx = t * ROW_SIZE + index;
203                 t = _rowIndex[idx];
204                 if (t == 0)
205                     return V.init;
206             } else {
207                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
208                 if (big is null)
209                     return V.init;
210                 t = big[c];
211                 if (t == 0)
212                     return V.init;
213             }
214         }
215         return _value[t];
216     }
217 
218     /* ------------------------------------------------------------ */
219     override
220     V get(ByteBuffer b, int offset, int len) {
221         int t = 0;
222         for (int i = 0; i < len; i++) {
223             byte c = b.get(offset + i);
224             int index = __lookup[c & 0x7f];
225             if (index >= 0) {
226                 int idx = t * ROW_SIZE + index;
227                 t = _rowIndex[idx];
228                 if (t == 0)
229                     return V.init;
230             } else {
231                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
232                 if (big is null)
233                     return V.init;
234                 t = big[c];
235                 if (t == 0)
236                     return V.init;
237             }
238         }
239         return _value[t];
240     }
241 
242     /* ------------------------------------------------------------ */
243     override
244     V getBest(byte[] b, int offset, int len) {
245         return getBest(0, b, offset, len);
246     }
247 
248     /* ------------------------------------------------------------ */
249     override
250     V getBest(ByteBuffer b, int offset, int len) {
251         if (b.hasArray())
252             return getBest(0, b.array(), b.arrayOffset() + b.position() + offset, len);
253         return getBest(0, b, offset, len);
254     }
255 
256     /* ------------------------------------------------------------ */
257     override
258     V getBest(string s, int offset, int len) {
259         return getBest(0, s, offset, len);
260     }
261 
262     /* ------------------------------------------------------------ */
263     private V getBest(int t, string s, int offset, int len) {
264         int pos = offset;
265         for (int i = 0; i < len; i++) {
266             byte c = s[pos++];
267             int index = __lookup[c & 0x7f];
268             if (index >= 0) {
269                 int idx = t * ROW_SIZE + index;
270                 int nt = _rowIndex[idx];
271                 if (nt == 0)
272                     break;
273                 t = nt;
274             } else {
275                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
276                 if (big is null)
277                     return V.init;
278                 int nt = big[c];
279                 if (nt == 0)
280                     break;
281                 t = nt;
282             }
283 
284             // Is the next Trie is a match
285             if (_key[t] !is null) {
286                 // Recurse so we can remember this possibility
287                 V best = getBest(t, s, offset + i + 1, len - i - 1);
288                 
289                 static if(is(V == class)) {
290                     if (best !is null)
291                         return best;
292                 } else {
293                     if (best != V.init)
294                         return best;
295                 }
296 
297                 return _value[t];
298             }
299         }
300         return _value[t];
301     }
302 
303     /* ------------------------------------------------------------ */
304     private V getBest(int t, byte[] b, int offset, int len) {
305         for (int i = 0; i < len; i++) {
306             byte c = b[offset + i];
307             int index = __lookup[c & 0x7f];
308             if (index >= 0) {
309                 int idx = t * ROW_SIZE + index;
310                 int nt = _rowIndex[idx];
311                 if (nt == 0)
312                     break;
313                 t = nt;
314             } else {
315                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
316                 if (big is null)
317                     return V.init;
318                 int nt = big[c];
319                 if (nt == 0)
320                     break;
321                 t = nt;
322             }
323 
324             // Is the next Trie is a match
325             if (_key[t] !is null) {
326                 // Recurse so we can remember this possibility
327                 V best = getBest(t, b, offset + i + 1, len - i - 1);
328                 static if(is(V == class)) {
329                     if (best !is null)
330                         return best;
331                 } else {
332                     if (best != V.init)
333                         return best;
334                 }
335 
336                 break;
337             }
338         }
339         return _value[t];
340     }
341 
342     private V getBest(int t, ByteBuffer b, int offset, int len) {
343         int pos = b.position() + offset;
344         for (int i = 0; i < len; i++) {
345             byte c = b.get(pos++);
346             int index = __lookup[c & 0x7f];
347             if (index >= 0) {
348                 int idx = t * ROW_SIZE + index;
349                 int nt = _rowIndex[idx];
350                 if (nt == 0)
351                     break;
352                 t = nt;
353             } else {
354                 byte[] big = _bigIndex is null ? null : _bigIndex[t];
355                 if (big is null)
356                     return V.init;
357                 int nt = big[c];
358                 if (nt == 0)
359                     break;
360                 t = nt;
361             }
362 
363             // Is the next Trie is a match
364             if (_key[t] !is null) {
365                 // Recurse so we can remember this possibility
366                 V best = getBest(t, b, offset + i + 1, len - i - 1);
367                 
368                 static if(is(V == class)) {
369                     if (best !is null)
370                         return best;
371                 } else {
372                     if (best != V.init)
373                         return best;
374                 }
375 
376                 break;
377             }
378         }
379         return _value[t];
380     }
381 
382 
383     override
384     string toString() {
385         StringBuilder buf = new StringBuilder();
386         toString(buf, 0);
387 
388         if (buf.length() == 0)
389             return "{}";
390 
391         buf.setCharAt(0, '{');
392         buf.append('}');
393         return buf.toString();
394     }
395 
396 
397     private void toString(Appendable ot, int t) {
398         void doAppend() {
399             try {
400                 ot.append(',');
401                 ot.append(_key[t]);
402                 ot.append('=');
403                 ot.append(_value[t].to!string());
404             } catch (IOException e) {
405                 throw new RuntimeException(e);
406             }
407         }
408 
409         static if(is(V == class)) {
410             if (_value[t] !is null)
411                 doAppend();
412         } else {
413             if (_value[t] != V.init)
414                 doAppend();
415         }
416 
417         for (int i = 0; i < ROW_SIZE; i++) {
418             int idx = t * ROW_SIZE + i;
419             if (_rowIndex[idx] != 0)
420                 toString(ot, _rowIndex[idx]);
421         }
422 
423         byte[] big = _bigIndex is null ? null : _bigIndex[t];
424         if (big !is null) {
425             foreach (int i ; big)
426                 if (i != 0)
427                     toString(ot, i);
428         }
429 
430     }
431 
432     override
433     Set!(string) keySet() {
434         Set!(string) keys = new HashSet!(string)();
435         keySet(keys, 0);
436         return keys;
437     }
438 
439     private void keySet(Set!(string) set, int t) {
440         static if(is(V == class)) {
441             if (t < _value.length && _value[t] !is null)
442                 set.add(_key[t]);
443         } else {
444             if (t < _value.length && _value[t] != V.init)
445                 set.add(_key[t]);
446         }
447 
448         for (int i = 0; i < ROW_SIZE; i++) {
449             int idx = t * ROW_SIZE + i;
450             if (idx < _rowIndex.length && _rowIndex[idx] != 0)
451                 keySet(set, _rowIndex[idx]);
452         }
453 
454         byte[] big = _bigIndex is null || t >= _bigIndex.length ? null : _bigIndex[t];
455         // if (big !is null) {
456             foreach (int i ; big)
457                 if (i != 0)
458                     keySet(set, i);
459         // }
460     }
461 
462     override
463     bool isFull() {
464         return _rows + 1 >= _key.length;
465     }
466 }