标签:style blog http io color os ar 使用 for
在定义后缀树(Suffix Tree)时,我们给出了一段简洁的描述:
A suffix tree is a compressed trie for all the suffixes of a text.
后缀数组(Suffix Array)的定义也同样简洁:
A suffix array is a sorted array of all suffixes of a text.
后缀数组由 Manber & Myers 在 1990 年首先提出《Suffix arrays: a new method for on-line string searches》,用以替代后缀树,并且改进了存储空间的需求。其与后缀树的的关系有:
下面用一个例子来解释后缀数组结构。假设文本 Text = "banana",这里增加一个末尾字符 "$" 以使所有的后缀与前缀不重复,要求 "$" 为 Text 中不存在并且按字典序排序要小于所有其他字符。(假设数组从 1 开始)
那么对于 Text = "banana$" 的后缀列表如下:
对所有后缀进行字典序(Lexicographical Order)排序:
上表中右侧列即为后缀数组(Suffix Array),SA = [7, 6, 4, 2, 1, 5, 3]。
这样,例如 SA[3] 的值为 4,就代表 Text = "banana$" 中从位置 4 开始的后缀 "ana$"。
在 2009 年,Nong, Ge; Zhang, Sen; Chan, Wai Hong 等发表了论文《Linear Suffix Array Construction by Almost Pure Induced-Sorting》,首次实现了如下 3 个目标:
Yuta Mori 对该算法做了具体的实现。
1 /* 2 * SAIS.cs for SAIS-CSharp 3 * Copyright (c) 2010 Yuta Mori. All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person 6 * obtaining a copy of this software and associated documentation 7 * files (the "Software"), to deal in the Software without 8 * restriction, including without limitation the rights to use, 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following 12 * conditions: 13 * 14 * The above copyright notice and this permission notice shall be 15 * included in all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 * OTHER DEALINGS IN THE SOFTWARE. 25 */ 26 27 namespace SuffixArray 28 { 29 internal interface BaseArray 30 { 31 int this[int i] 32 { 33 set; 34 get; 35 } 36 } 37 38 internal class ByteArray : BaseArray 39 { 40 private byte[] m_array; 41 private int m_pos; 42 public ByteArray(byte[] array, int pos) 43 { 44 m_pos = pos; 45 m_array = array; 46 } 47 ~ByteArray() { m_array = null; } 48 public int this[int i] 49 { 50 set { m_array[i + m_pos] = (byte)value; } 51 get { return (int)m_array[i + m_pos]; } 52 } 53 } 54 55 internal class CharArray : BaseArray 56 { 57 private char[] m_array; 58 private int m_pos; 59 public CharArray(char[] array, int pos) 60 { 61 m_pos = pos; 62 m_array = array; 63 } 64 ~CharArray() { m_array = null; } 65 public int this[int i] 66 { 67 set { m_array[i + m_pos] = (char)value; } 68 get { return (int)m_array[i + m_pos]; } 69 } 70 } 71 72 internal class IntArray : BaseArray 73 { 74 private int[] m_array; 75 private int m_pos; 76 public IntArray(int[] array, int pos) 77 { 78 m_pos = pos; 79 m_array = array; 80 } 81 ~IntArray() { m_array = null; } 82 public int this[int i] 83 { 84 set { m_array[i + m_pos] = value; } 85 get { return m_array[i + m_pos]; } 86 } 87 } 88 89 internal class StringArray : BaseArray 90 { 91 private string m_array; 92 private int m_pos; 93 public StringArray(string array, int pos) 94 { 95 m_pos = pos; 96 m_array = array; 97 } 98 ~StringArray() { m_array = null; } 99 public int this[int i] 100 { 101 set { } 102 get { return (int)m_array[i + m_pos]; } 103 } 104 } 105 106 /// <summary> 107 /// An implementation of the induced sorting based suffix array construction algorithm. 108 /// </summary> 109 public static class SAIS 110 { 111 private const int MINBUCKETSIZE = 256; 112 113 private static 114 void 115 getCounts(BaseArray T, BaseArray C, int n, int k) 116 { 117 int i; 118 for (i = 0; i < k; ++i) { C[i] = 0; } 119 for (i = 0; i < n; ++i) { C[T[i]] = C[T[i]] + 1; } 120 } 121 private static 122 void 123 getBuckets(BaseArray C, BaseArray B, int k, bool end) 124 { 125 int i, sum = 0; 126 if (end != false) { for (i = 0; i < k; ++i) { sum += C[i]; B[i] = sum; } } 127 else { for (i = 0; i < k; ++i) { sum += C[i]; B[i] = sum - C[i]; } } 128 } 129 130 /* sort all type LMS suffixes */ 131 private static 132 void 133 LMSsort(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) 134 { 135 int b, i, j; 136 int c0, c1; 137 /* compute SAl */ 138 if (C == B) { getCounts(T, C, n, k); } 139 getBuckets(C, B, k, false); /* find starts of buckets */ 140 j = n - 1; 141 b = B[c1 = T[j]]; 142 --j; 143 SA[b++] = (T[j] < c1) ? ~j : j; 144 for (i = 0; i < n; ++i) 145 { 146 if (0 < (j = SA[i])) 147 { 148 if ((c0 = T[j]) != c1) { B[c1] = b; b = B[c1 = c0]; } 149 --j; 150 SA[b++] = (T[j] < c1) ? ~j : j; 151 SA[i] = 0; 152 } 153 else if (j < 0) 154 { 155 SA[i] = ~j; 156 } 157 } 158 /* compute SAs */ 159 if (C == B) { getCounts(T, C, n, k); } 160 getBuckets(C, B, k, true); /* find ends of buckets */ 161 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i) 162 { 163 if (0 < (j = SA[i])) 164 { 165 if ((c0 = T[j]) != c1) { B[c1] = b; b = B[c1 = c0]; } 166 --j; 167 SA[--b] = (T[j] > c1) ? ~(j + 1) : j; 168 SA[i] = 0; 169 } 170 } 171 } 172 private static 173 int 174 LMSpostproc(BaseArray T, int[] SA, int n, int m) 175 { 176 int i, j, p, q, plen, qlen, name; 177 int c0, c1; 178 bool diff; 179 180 /* compact all the sorted substrings into the first m items of SA 181 2*m must be not larger than n (proveable) */ 182 for (i = 0; (p = SA[i]) < 0; ++i) { SA[i] = ~p; } 183 if (i < m) 184 { 185 for (j = i, ++i; ; ++i) 186 { 187 if ((p = SA[i]) < 0) 188 { 189 SA[j++] = ~p; SA[i] = 0; 190 if (j == m) { break; } 191 } 192 } 193 } 194 195 /* store the length of all substrings */ 196 i = n - 1; j = n - 1; c0 = T[n - 1]; 197 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 198 for (; 0 <= i; ) 199 { 200 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1)); 201 if (0 <= i) 202 { 203 SA[m + ((i + 1) >> 1)] = j - i; j = i + 1; 204 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 205 } 206 } 207 208 /* find the lexicographic names of all substrings */ 209 for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) 210 { 211 p = SA[i]; plen = SA[m + (p >> 1)]; diff = true; 212 if ((plen == qlen) && ((q + plen) < n)) 213 { 214 for (j = 0; (j < plen) && (T[p + j] == T[q + j]); ++j) { } 215 if (j == plen) { diff = false; } 216 } 217 if (diff != false) { ++name; q = p; qlen = plen; } 218 SA[m + (p >> 1)] = name; 219 } 220 221 return name; 222 } 223 224 /* compute SA and BWT */ 225 private static 226 void 227 induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) 228 { 229 int b, i, j; 230 int c0, c1; 231 /* compute SAl */ 232 if (C == B) { getCounts(T, C, n, k); } 233 getBuckets(C, B, k, false); /* find starts of buckets */ 234 j = n - 1; 235 b = B[c1 = T[j]]; 236 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j; 237 for (i = 0; i < n; ++i) 238 { 239 j = SA[i]; SA[i] = ~j; 240 if (0 < j) 241 { 242 if ((c0 = T[--j]) != c1) { B[c1] = b; b = B[c1 = c0]; } 243 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j; 244 } 245 } 246 /* compute SAs */ 247 if (C == B) { getCounts(T, C, n, k); } 248 getBuckets(C, B, k, true); /* find ends of buckets */ 249 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i) 250 { 251 if (0 < (j = SA[i])) 252 { 253 if ((c0 = T[--j]) != c1) { B[c1] = b; b = B[c1 = c0]; } 254 SA[--b] = ((j == 0) || (T[j - 1] > c1)) ? ~j : j; 255 } 256 else 257 { 258 SA[i] = ~j; 259 } 260 } 261 } 262 private static 263 int 264 computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) 265 { 266 int b, i, j, pidx = -1; 267 int c0, c1; 268 /* compute SAl */ 269 if (C == B) { getCounts(T, C, n, k); } 270 getBuckets(C, B, k, false); /* find starts of buckets */ 271 j = n - 1; 272 b = B[c1 = T[j]]; 273 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j; 274 for (i = 0; i < n; ++i) 275 { 276 if (0 < (j = SA[i])) 277 { 278 SA[i] = ~(c0 = T[--j]); 279 if (c0 != c1) { B[c1] = b; b = B[c1 = c0]; } 280 SA[b++] = ((0 < j) && (T[j - 1] < c1)) ? ~j : j; 281 } 282 else if (j != 0) 283 { 284 SA[i] = ~j; 285 } 286 } 287 /* compute SAs */ 288 if (C == B) { getCounts(T, C, n, k); } 289 getBuckets(C, B, k, true); /* find ends of buckets */ 290 for (i = n - 1, b = B[c1 = 0]; 0 <= i; --i) 291 { 292 if (0 < (j = SA[i])) 293 { 294 SA[i] = (c0 = T[--j]); 295 if (c0 != c1) { B[c1] = b; b = B[c1 = c0]; } 296 SA[--b] = ((0 < j) && (T[j - 1] > c1)) ? ~((int)T[j - 1]) : j; 297 } 298 else if (j != 0) 299 { 300 SA[i] = ~j; 301 } 302 else 303 { 304 pidx = i; 305 } 306 } 307 return pidx; 308 } 309 310 /* find the suffix array SA of T[0..n-1] in {0..k-1}^n 311 use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet */ 312 private static 313 int 314 sais_main(BaseArray T, int[] SA, int fs, int n, int k, bool isbwt) 315 { 316 BaseArray C, B, RA; 317 int i, j, b, m, p, q, name, pidx = 0, newfs; 318 int c0, c1; 319 uint flags = 0; 320 321 if (k <= MINBUCKETSIZE) 322 { 323 C = new IntArray(new int[k], 0); 324 if (k <= fs) { B = new IntArray(SA, n + fs - k); flags = 1; } 325 else { B = new IntArray(new int[k], 0); flags = 3; } 326 } 327 else if (k <= fs) 328 { 329 C = new IntArray(SA, n + fs - k); 330 if (k <= (fs - k)) { B = new IntArray(SA, n + fs - k * 2); flags = 0; } 331 else if (k <= (MINBUCKETSIZE * 4)) { B = new IntArray(new int[k], 0); flags = 2; } 332 else { B = C; flags = 8; } 333 } 334 else 335 { 336 C = B = new IntArray(new int[k], 0); 337 flags = 4 | 8; 338 } 339 340 /* stage 1: reduce the problem by at least 1/2 341 sort all the LMS-substrings */ 342 getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */ 343 for (i = 0; i < n; ++i) { SA[i] = 0; } 344 b = -1; i = n - 1; j = n; m = 0; c0 = T[n - 1]; 345 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 346 for (; 0 <= i; ) 347 { 348 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1)); 349 if (0 <= i) 350 { 351 if (0 <= b) { SA[b] = j; } b = --B[c1]; j = i; ++m; 352 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 353 } 354 } 355 if (1 < m) 356 { 357 LMSsort(T, SA, C, B, n, k); 358 name = LMSpostproc(T, SA, n, m); 359 } 360 else if (m == 1) 361 { 362 SA[b] = j + 1; 363 name = 1; 364 } 365 else 366 { 367 name = 0; 368 } 369 370 /* stage 2: solve the reduced problem 371 recurse if names are not yet unique */ 372 if (name < m) 373 { 374 if ((flags & 4) != 0) { C = null; B = null; } 375 if ((flags & 2) != 0) { B = null; } 376 newfs = (n + fs) - (m * 2); 377 if ((flags & (1 | 4 | 8)) == 0) 378 { 379 if ((k + name) <= newfs) { newfs -= k; } 380 else { flags |= 8; } 381 } 382 for (i = m + (n >> 1) - 1, j = m * 2 + newfs - 1; m <= i; --i) 383 { 384 if (SA[i] != 0) { SA[j--] = SA[i] - 1; } 385 } 386 RA = new IntArray(SA, m + newfs); 387 sais_main(RA, SA, newfs, m, name, false); 388 RA = null; 389 390 i = n - 1; j = m * 2 - 1; c0 = T[n - 1]; 391 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 392 for (; 0 <= i; ) 393 { 394 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) <= c1)); 395 if (0 <= i) 396 { 397 SA[j--] = i + 1; 398 do { c1 = c0; } while ((0 <= --i) && ((c0 = T[i]) >= c1)); 399 } 400 } 401 402 for (i = 0; i < m; ++i) { SA[i] = SA[m + SA[i]]; } 403 if ((flags & 4) != 0) { C = B = new IntArray(new int[k], 0); } 404 if ((flags & 2) != 0) { B = new IntArray(new int[k], 0); } 405 } 406 407 /* stage 3: induce the result for the original problem */ 408 if ((flags & 8) != 0) { getCounts(T, C, n, k); } 409 /* put all left-most S characters into their buckets */ 410 if (1 < m) 411 { 412 getBuckets(C, B, k, true); /* find ends of buckets */ 413 i = m - 1; j = n; p = SA[m - 1]; c1 = T[p]; 414 do 415 { 416 q = B[c0 = c1]; 417 while (q < j) { SA[--j] = 0; } 418 do 419 { 420 SA[--j] = p; 421 if (--i < 0) { break; } 422 p = SA[i]; 423 } while ((c1 = T[p]) == c0); 424 } while (0 <= i); 425 while (0 < j) { SA[--j] = 0; } 426 } 427 if (isbwt == false) { induceSA(T, SA, C, B, n, k); } 428 else { pidx = computeBWT(T, SA, C, B, n, k); } 429 C = null; B = null; 430 return pidx; 431 } 432 433 /*- Suffixsorting -*/ 434 /* byte */ 435 /// <summary> 436 /// Constructs the suffix array of a given string in linear time. 437 /// </summary> 438 /// <param name="T">input string</param> 439 /// <param name="SA">output suffix array</param> 440 /// <param name="n">length of the given string</param> 441 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns> 442 public static 443 int 444 sufsort(byte[] T, int[] SA, int n) 445 { 446 if ((T == null) || (SA == null) || 447 (T.Length < n) || (SA.Length < n)) { return -1; } 448 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } 449 return sais_main(new ByteArray(T, 0), SA, 0, n, 256, false); 450 } 451 /* char */ 452 /// <summary> 453 /// Constructs the suffix array of a given string in linear time. 454 /// </summary> 455 /// <param name="T">input string</param> 456 /// <param name="SA">output suffix array</param> 457 /// <param name="n">length of the given string</param> 458 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns> 459 public static 460 int 461 sufsort(char[] T, int[] SA, int n) 462 { 463 if ((T == null) || (SA == null) || 464 (T.Length < n) || (SA.Length < n)) { return -1; } 465 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } 466 return sais_main(new CharArray(T, 0), SA, 0, n, 65536, false); 467 } 468 /* int */ 469 /// <summary> 470 /// Constructs the suffix array of a given string in linear time. 471 /// </summary> 472 /// <param name="T">input string</param> 473 /// <param name="SA">output suffix array</param> 474 /// <param name="n">length of the given string</param> 475 /// <param name="k">alphabet size</param> 476 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns> 477 public static 478 int 479 sufsort(int[] T, int[] SA, int n, int k) 480 { 481 if ((T == null) || (SA == null) || 482 (T.Length < n) || (SA.Length < n) || 483 (k <= 0)) { return -1; } 484 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } 485 return sais_main(new IntArray(T, 0), SA, 0, n, k, false); 486 } 487 /* string */ 488 /// <summary> 489 /// Constructs the suffix array of a given string in linear time. 490 /// </summary> 491 /// <param name="T">input string</param> 492 /// <param name="SA">output suffix array</param> 493 /// <param name="n">length of the given string</param> 494 /// <returns>0 if no error occurred, -1 or -2 otherwise</returns> 495 public static 496 int 497 sufsort(string T, int[] SA, int n) 498 { 499 if ((T == null) || (SA == null) || 500 (T.Length < n) || (SA.Length < n)) { return -1; } 501 if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } 502 return sais_main(new StringArray(T, 0), SA, 0, n, 65536, false); 503 } 504 505 /*- Burrows-Wheeler Transform -*/ 506 /* byte */ 507 /// <summary> 508 /// Constructs the burrows-wheeler transformed string of a given string in linear time. 509 /// </summary> 510 /// <param name="T">input string</param> 511 /// <param name="U">output string</param> 512 /// <param name="A">temporary array</param> 513 /// <param name="n">length of the given string</param> 514 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns> 515 public static 516 int 517 bwt(byte[] T, byte[] U, int[] A, int n) 518 { 519 int i, pidx; 520 if ((T == null) || (U == null) || (A == null) || 521 (T.Length < n) || (U.Length < n) || (A.Length < n)) { return -1; } 522 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; } 523 pidx = sais_main(new ByteArray(T, 0), A, 0, n, 256, true); 524 U[0] = T[n - 1]; 525 for (i = 0; i < pidx; ++i) { U[i + 1] = (byte)A[i]; } 526 for (i += 1; i < n; ++i) { U[i] = (byte)A[i]; } 527 return pidx + 1; 528 } 529 /* char */ 530 /// <summary> 531 /// Constructs the burrows-wheeler transformed string of a given string in linear time. 532 /// </summary> 533 /// <param name="T">input string</param> 534 /// <param name="U">output string</param> 535 /// <param name="A">temporary array</param> 536 /// <param name="n">length of the given string</param> 537 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns> 538 public static 539 int 540 bwt(char[] T, char[] U, int[] A, int n) 541 { 542 int i, pidx; 543 if ((T == null) || (U == null) || (A == null) || 544 (T.Length < n) || (U.Length < n) || (A.Length < n)) { return -1; } 545 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; } 546 pidx = sais_main(new CharArray(T, 0), A, 0, n, 65536, true); 547 U[0] = T[n - 1]; 548 for (i = 0; i < pidx; ++i) { U[i + 1] = (char)A[i]; } 549 for (i += 1; i < n; ++i) { U[i] = (char)A[i]; } 550 return pidx + 1; 551 } 552 /* int */ 553 /// <summary> 554 /// Constructs the burrows-wheeler transformed string of a given string in linear time. 555 /// </summary> 556 /// <param name="T">input string</param> 557 /// <param name="U">output string</param> 558 /// <param name="A">temporary array</param> 559 /// <param name="n">length of the given string</param> 560 /// <param name="k">alphabet size</param> 561 /// <returns>primary index if no error occurred, -1 or -2 otherwise</returns> 562 public static 563 int 564 bwt(int[] T, int[] U, int[] A, int n, int k) 565 { 566 int i, pidx; 567 if ((T == null) || (U == null) || (A == null) || 568 (T.Length < n) || (U.Length < n) || (A.Length < n) || 569 (k <= 0)) { return -1; } 570 if (n <= 1) { if (n == 1) { U[0] = T[0]; } return n; } 571 pidx = sais_main(new IntArray(T, 0), A, 0, n, k, true); 572 U[0] = T[n - 1]; 573 for (i = 0; i < pidx; ++i) { U[i + 1] = A[i]; } 574 for (i += 1; i < n; ++i) { U[i] = A[i]; } 575 return pidx + 1; 576 } 577 } 578 }
1 /* 2 * suftest.cs for SAIS-CSharp 3 * Copyright (c) 2010 Yuta Mori. All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person 6 * obtaining a copy of this software and associated documentation 7 * files (the "Software"), to deal in the Software without 8 * restriction, including without limitation the rights to use, 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following 12 * conditions: 13 * 14 * The above copyright notice and this permission notice shall be 15 * included in all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 * OTHER DEALINGS IN THE SOFTWARE. 25 */ 26 27 using SuffixArray; 28 29 namespace suftest 30 { 31 class MainClass 32 { 33 private static 34 int 35 check(byte[] T, int[] SA, int n, bool verbose) 36 { 37 int[] C = new int[256]; 38 int i, p, q, t; 39 int c; 40 41 if (verbose) { System.Console.Write(@"sufcheck: "); } 42 if (n == 0) 43 { 44 if (verbose) { System.Console.WriteLine("Done."); } 45 return 0; 46 } 47 48 /* Check arguments. */ 49 if ((T == null) || (SA == null) || (n < 0)) 50 { 51 if (verbose) { System.Console.WriteLine("Invalid arguments."); } 52 return -1; 53 } 54 55 /* check range: [0..n-1] */ 56 for (i = 0; i < n; ++i) 57 { 58 if ((SA[i] < 0) || (n <= SA[i])) 59 { 60 if (verbose) 61 { 62 System.Console.WriteLine("Out of the range [0," + (n - 1) + "]."); 63 System.Console.WriteLine(" SA[" + i + "]=" + SA[i]); 64 } 65 return -2; 66 } 67 } 68 69 /* check first characters. */ 70 for (i = 1; i < n; ++i) 71 { 72 if (T[SA[i - 1]] > T[SA[i]]) 73 { 74 if (verbose) 75 { 76 System.Console.WriteLine("Suffixes in wrong order."); 77 System.Console.Write(" T[SA[" + (i - 1) + "]=" + SA[i - 1] + "]=" + T[SA[i - 1]]); 78 System.Console.WriteLine(" > T[SA[" + i + "]=" + SA[i] + "]=" + T[SA[i]]); 79 } 80 return -3; 81 } 82 } 83 84 /* check suffixes. */ 85 for (i = 0; i < 256; ++i) { C[i] = 0; } 86 for (i = 0; i < n; ++i) { ++C[T[i]]; } 87 for (i = 0, p = 0; i < 256; ++i) 88 { 89 t = C[i]; 90 C[i] = p; 91 p += t; 92 } 93 94 q = C[T[n - 1]]; 95 C[T[n - 1]] += 1; 96 for (i = 0; i < n; ++i) 97 { 98 p = SA[i]; 99 if (0 < p) 100 { 101 c = T[--p]; 102 t = C[c]; 103 } 104 else 105 { 106 c = T[p = n - 1]; 107 t = q; 108 } 109 if ((t < 0) || (p != SA[t])) 110 { 111 if (verbose) 112 { 113 System.Console.WriteLine("Suffixes in wrong position."); 114 System.Console.WriteLine(" SA[" + t + "]=" + ((0 <= t) ? SA[t] : -1) + " or"); 115 System.Console.WriteLine(" SA[" + i + "]=" + SA[i]); 116 } 117 return -4; 118 } 119 if (t != q) 120 { 121 ++C[c]; 122 if ((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } 123 } 124 } 125 C = null; 126 127 if (verbose) { System.Console.WriteLine("Done."); } 128 return 0; 129 } 130 131 public static 132 void 133 Main(string[] args) 134 { 135 int i; 136 for (i = 0; i < args.Length; ++i) 137 { 138 using (System.IO.FileStream fs = new System.IO.FileStream( 139 args[i], 140 System.IO.FileMode.Open, 141 System.IO.FileAccess.Read)) 142 { 143 System.Console.Write(args[i] + @": {0:d} bytes ... ", fs.Length); 144 byte[] T = new byte[fs.Length]; 145 int[] SA = new int[fs.Length]; 146 fs.Read(T, 0, T.Length); 147 148 System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch(); 149 sw.Start(); 150 SAIS.sufsort(T, SA, T.Length); 151 sw.Stop(); 152 153 double sec = (double)sw.ElapsedTicks / (double)System.Diagnostics.Stopwatch.Frequency; 154 System.Console.WriteLine(@"{0:f4} sec", sec); 155 156 check(T, SA, T.Length, true); 157 T = null; 158 SA = null; 159 } 160 } 161 } 162 } 163 }
本文《后缀数组》由 Dennis Gao 发表自博客园,未经作者本人同意禁止任何形式的转载,任何自动或人为的爬虫转载行为均为耍流氓。
标签:style blog http io color os ar 使用 for
原文地址:http://www.cnblogs.com/gaochundong/p/suffix_array.html