1 /* 2 * Hunt - A refined core library for D programming language. 3 * 4 * Copyright (C) 2018-2019 HuntLabs 5 * 6 * Website: https://www.huntlabs.net/ 7 * 8 * Licensed under the Apache-2.0 License. 9 * 10 */ 11 12 module hunt.collection.ArrayTrie; 13 14 import hunt.collection.AbstractTrie; 15 import hunt.util.Common; 16 import hunt.collection.ByteBuffer; 17 import hunt.collection.HashSet; 18 import hunt.collection.Set; 19 import hunt.collection.Trie; 20 21 import hunt.Exceptions; 22 import hunt.text.StringBuilder; 23 24 import hunt.logging; 25 import std.conv; 26 27 /** 28 * <p>A Trie string lookup data structure using a fixed size array.</p> 29 * <p>This implementation is always case insensitive and is optimal for 30 * a small number of fixed strings with few special characters. The 31 * Trie is stored in an array of lookup tables, each indexed by the 32 * next character of the key. Frequently used characters are directly 33 * indexed in each lookup table, whilst infrequently used characters 34 * must use a big character table. 35 * </p> 36 * <p>This Trie is very space efficient if the key characters are 37 * from ' ', '+', '-', ':', ';', '.', 'A' to 'Z' or 'a' to 'z'. 38 * Other ISO-8859-1 characters can be used by the key, but less space 39 * efficiently. 40 * </p> 41 * <p>This Trie is not Threadsafe and contains no mutual exclusion 42 * or deliberate memory barriers. It is intended for an ArrayTrie to be 43 * built by a single thread and then used concurrently by multiple threads 44 * and not mutated during that access. If concurrent mutations of the 45 * Trie is required external locks need to be applied. 46 * </p> 47 * 48 * @param (V) the element of entry 49 */ 50 class ArrayTrie(V) : AbstractTrie!(V) { 51 /** 52 * The Size of a Trie row is how many characters can be looked 53 * up directly without going to a big index. This is set at 54 * 32 to cover case insensitive alphabet and a few other common 55 * characters. 56 */ 57 private enum int ROW_SIZE = 32; 58 59 /** 60 * The index lookup table, this maps a character as a byte 61 * (ISO-8859-1 or UTF8) to an index within a Trie row 62 */ 63 private enum int[] __lookup = 64 [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F 65 /*0*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 66 /*1*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 67 /*2*/31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, 27, 30, -1, 68 /*3*/-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28, 29, -1, -1, -1, -1, 69 /*4*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 70 /*5*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 71 /*6*/-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 72 /*7*/15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 73 ]; 74 75 /** 76 * The Trie rows in a single array which allows a lookup of row,character 77 * to the next row in the Trie. This is actually a 2 dimensional 78 * array that has been flattened to achieve locality of reference. 79 * The first ROW_SIZE entries are for row 0, then next ROW_SIZE 80 * entries are for row 1 etc. So in general instead of using 81 * _rows[row][index], we use _rows[row*ROW_SIZE+index] to look up 82 * the next row for a given character. 83 * <p> 84 * The array is of characters rather than integers to save space. 85 */ 86 private byte[] _rowIndex; 87 88 /** 89 * The key (if any) for a Trie row. 90 * A row may be a leaf, a node or both in the Trie tree. 91 */ 92 private string[] _key; 93 94 /** 95 * The value (if any) for a Trie row. 96 * A row may be a leaf, a node or both in the Trie tree. 97 */ 98 private V[] _value; 99 100 /** 101 * A big index for each row. 102 * If a character outside of the lookup map is needed, 103 * then a big index will be created for the row, with 104 * 256 entries, one for each possible byte. 105 */ 106 private byte[][] _bigIndex; 107 108 /** 109 * The number of rows allocated 110 */ 111 private byte _rows; 112 113 this() { 114 this(128); 115 } 116 117 /* ------------------------------------------------------------ */ 118 119 /** 120 * @param capacity The capacity of the trie, which at the worst case 121 * is the total number of characters of all keys stored in the Trie. 122 * The capacity needed is dependent of the shared prefixes of the keys. 123 * For example, a capacity of 6 nodes is required to store keys "foo" 124 * and "bar", but a capacity of only 4 is required to 125 * store "bar" and "bat". 126 */ 127 this(int capacity) { 128 super(true); 129 _value = new V[capacity]; 130 _rowIndex = new byte[capacity * 32]; 131 _key = new string[capacity]; 132 } 133 134 /* ------------------------------------------------------------ */ 135 override 136 void clear() { 137 _rows = 0; 138 _value[] = V.init; 139 _rowIndex[] = 0; 140 _key[] = null; 141 } 142 143 /* ------------------------------------------------------------ */ 144 override 145 bool put(string s, V v) { 146 int t = 0; 147 size_t k; 148 size_t limit = s.length; 149 for (k = 0; k < limit; k++) { 150 byte c = s[k]; 151 152 int index = __lookup[c & 0x7f]; 153 if (index >= 0) { 154 int idx = t * ROW_SIZE + index; 155 t = _rowIndex[idx]; 156 if (t == 0) { 157 if (++_rows >= _value.length) 158 return false; 159 t = _rowIndex[idx] = _rows; 160 } 161 } else if (c > 127) 162 throw new IllegalArgumentException("non ascii character"); 163 else { 164 if (_bigIndex is null) 165 _bigIndex = new byte[][_value.length]; 166 if (t >= _bigIndex.length) 167 return false; 168 byte[] big = _bigIndex[t]; 169 if (big is null) 170 big = _bigIndex[t] = new byte[128]; 171 t = big[c]; 172 if (t == 0) { 173 if (_rows == _value.length) 174 return false; 175 t = big[c] = ++_rows; 176 } 177 } 178 } 179 180 if (t >= _key.length) { 181 _rows = cast(byte) _key.length; 182 return false; 183 } 184 185 static if(is(V == class)) { 186 _key[t] = v is null ? null : s; 187 } else { 188 _key[t] = v == V.init ? null : s; 189 } 190 _value[t] = v; 191 return true; 192 } 193 194 /* ------------------------------------------------------------ */ 195 override 196 V get(string s, int offset, int len) { 197 int t = 0; 198 for (int i = 0; i < len; i++) { 199 byte c = s[offset + i]; 200 int index = __lookup[c & 0x7f]; 201 if (index >= 0) { 202 int idx = t * ROW_SIZE + index; 203 t = _rowIndex[idx]; 204 if (t == 0) 205 return V.init; 206 } else { 207 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 208 if (big is null) 209 return V.init; 210 t = big[c]; 211 if (t == 0) 212 return V.init; 213 } 214 } 215 return _value[t]; 216 } 217 218 /* ------------------------------------------------------------ */ 219 override 220 V get(ByteBuffer b, int offset, int len) { 221 int t = 0; 222 for (int i = 0; i < len; i++) { 223 byte c = b.get(offset + i); 224 int index = __lookup[c & 0x7f]; 225 if (index >= 0) { 226 int idx = t * ROW_SIZE + index; 227 t = _rowIndex[idx]; 228 if (t == 0) 229 return V.init; 230 } else { 231 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 232 if (big is null) 233 return V.init; 234 t = big[c]; 235 if (t == 0) 236 return V.init; 237 } 238 } 239 return _value[t]; 240 } 241 242 /* ------------------------------------------------------------ */ 243 override 244 V getBest(byte[] b, int offset, int len) { 245 return getBest(0, b, offset, len); 246 } 247 248 /* ------------------------------------------------------------ */ 249 override 250 V getBest(ByteBuffer b, int offset, int len) { 251 if (b.hasArray()) 252 return getBest(0, b.array(), b.arrayOffset() + b.position() + offset, len); 253 return getBest(0, b, offset, len); 254 } 255 256 /* ------------------------------------------------------------ */ 257 override 258 V getBest(string s, int offset, int len) { 259 return getBest(0, s, offset, len); 260 } 261 262 /* ------------------------------------------------------------ */ 263 private V getBest(int t, string s, int offset, int len) { 264 int pos = offset; 265 for (int i = 0; i < len; i++) { 266 byte c = s[pos++]; 267 int index = __lookup[c & 0x7f]; 268 if (index >= 0) { 269 int idx = t * ROW_SIZE + index; 270 int nt = _rowIndex[idx]; 271 if (nt == 0) 272 break; 273 t = nt; 274 } else { 275 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 276 if (big is null) 277 return V.init; 278 int nt = big[c]; 279 if (nt == 0) 280 break; 281 t = nt; 282 } 283 284 // Is the next Trie is a match 285 if (_key[t] !is null) { 286 // Recurse so we can remember this possibility 287 V best = getBest(t, s, offset + i + 1, len - i - 1); 288 289 static if(is(V == class)) { 290 if (best !is null) 291 return best; 292 } else { 293 if (best != V.init) 294 return best; 295 } 296 297 return _value[t]; 298 } 299 } 300 return _value[t]; 301 } 302 303 /* ------------------------------------------------------------ */ 304 private V getBest(int t, byte[] b, int offset, int len) { 305 for (int i = 0; i < len; i++) { 306 byte c = b[offset + i]; 307 int index = __lookup[c & 0x7f]; 308 if (index >= 0) { 309 int idx = t * ROW_SIZE + index; 310 int nt = _rowIndex[idx]; 311 if (nt == 0) 312 break; 313 t = nt; 314 } else { 315 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 316 if (big is null) 317 return V.init; 318 int nt = big[c]; 319 if (nt == 0) 320 break; 321 t = nt; 322 } 323 324 // Is the next Trie is a match 325 if (_key[t] !is null) { 326 // Recurse so we can remember this possibility 327 V best = getBest(t, b, offset + i + 1, len - i - 1); 328 static if(is(V == class)) { 329 if (best !is null) 330 return best; 331 } else { 332 if (best != V.init) 333 return best; 334 } 335 336 break; 337 } 338 } 339 return _value[t]; 340 } 341 342 private V getBest(int t, ByteBuffer b, int offset, int len) { 343 int pos = b.position() + offset; 344 for (int i = 0; i < len; i++) { 345 byte c = b.get(pos++); 346 int index = __lookup[c & 0x7f]; 347 if (index >= 0) { 348 int idx = t * ROW_SIZE + index; 349 int nt = _rowIndex[idx]; 350 if (nt == 0) 351 break; 352 t = nt; 353 } else { 354 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 355 if (big is null) 356 return V.init; 357 int nt = big[c]; 358 if (nt == 0) 359 break; 360 t = nt; 361 } 362 363 // Is the next Trie is a match 364 if (_key[t] !is null) { 365 // Recurse so we can remember this possibility 366 V best = getBest(t, b, offset + i + 1, len - i - 1); 367 368 static if(is(V == class)) { 369 if (best !is null) 370 return best; 371 } else { 372 if (best != V.init) 373 return best; 374 } 375 376 break; 377 } 378 } 379 return _value[t]; 380 } 381 382 383 override 384 string toString() { 385 StringBuilder buf = new StringBuilder(); 386 toString(buf, 0); 387 388 if (buf.length() == 0) 389 return "{}"; 390 391 buf.setCharAt(0, '{'); 392 buf.append('}'); 393 return buf.toString(); 394 } 395 396 397 private void toString(Appendable ot, int t) { 398 void doAppend() { 399 try { 400 ot.append(','); 401 ot.append(_key[t]); 402 ot.append('='); 403 ot.append(_value[t].to!string()); 404 } catch (IOException e) { 405 throw new RuntimeException(e); 406 } 407 } 408 409 static if(is(V == class)) { 410 if (_value[t] !is null) 411 doAppend(); 412 } else { 413 if (_value[t] != V.init) 414 doAppend(); 415 } 416 417 for (int i = 0; i < ROW_SIZE; i++) { 418 int idx = t * ROW_SIZE + i; 419 if (_rowIndex[idx] != 0) 420 toString(ot, _rowIndex[idx]); 421 } 422 423 byte[] big = _bigIndex is null ? null : _bigIndex[t]; 424 if (big !is null) { 425 foreach (int i ; big) 426 if (i != 0) 427 toString(ot, i); 428 } 429 430 } 431 432 override 433 Set!(string) keySet() { 434 Set!(string) keys = new HashSet!(string)(); 435 keySet(keys, 0); 436 return keys; 437 } 438 439 private void keySet(Set!(string) set, int t) { 440 static if(is(V == class)) { 441 if (t < _value.length && _value[t] !is null) 442 set.add(_key[t]); 443 } else { 444 if (t < _value.length && _value[t] != V.init) 445 set.add(_key[t]); 446 } 447 448 for (int i = 0; i < ROW_SIZE; i++) { 449 int idx = t * ROW_SIZE + i; 450 if (idx < _rowIndex.length && _rowIndex[idx] != 0) 451 keySet(set, _rowIndex[idx]); 452 } 453 454 byte[] big = _bigIndex is null || t >= _bigIndex.length ? null : _bigIndex[t]; 455 // if (big !is null) { 456 foreach (int i ; big) 457 if (i != 0) 458 keySet(set, i); 459 // } 460 } 461 462 override 463 bool isFull() { 464 return _rows + 1 >= _key.length; 465 } 466 }