标签:new only 其他 null ima ret out region false
完美区分 UTF-8(带 BOM)、UTF-8 without BOM 以及其他编码。
最少需要 4 个 byte。
/// <summary>
/// Heuristically guesses the text encoding of a byte buffer: BOM-marked
/// UTF-8/UTF-16, BOM-less UTF-8, BOM-less UTF-16 (LE or BE), plain ASCII,
/// ANSI (any single-byte 0-255 data) or None (unknown / binary).
/// </summary>
public class TextEncodingDetect
{
    #region Fields

    // Byte order marks recognised by CheckBOM.
    private static readonly byte[] utf16LEBOM = { 0xFF, 0xFE };
    private static readonly byte[] utf16BEBOM = { 0xFE, 0xFF };
    private static readonly byte[] utf8BOM = { 0xEF, 0xBB, 0xBF };

    // When true, a null byte makes the UTF-8 check fail and makes the final
    // ANSI-vs-binary decision come out as None.
    private bool nullSuggestsBinary = true;

    // Percentages used by CheckUTF16ASCII: the share of nulls that must be
    // exceeded on one byte parity, and must not be reached on the other.
    private double utf16ExpectedNullPercent = 70;
    private double utf16UnexpectedNullPercent = 10;

    #endregion

    #region Enums

    /// <summary>
    /// The encodings that the detector can report.
    /// </summary>
    public enum Encoding
    {
        None,               // Unknown or binary
        ANSI,               // 0-255
        ASCII,              // 0-127
        UTF8_BOM,           // UTF8 with BOM
        UTF8_NOBOM,         // UTF8 without BOM
        UTF16_LE_BOM,       // UTF16 LE with BOM
        UTF16_LE_NOBOM,     // UTF16 LE without BOM
        UTF16_BE_BOM,       // UTF16-BE with BOM
        UTF16_BE_NOBOM      // UTF16-BE without BOM
    }

    #endregion

    #region Properties

    /// <summary>
    /// Sets whether a null byte in otherwise undetected data means "binary"
    /// (result None) rather than ANSI. Defaults to true.
    /// </summary>
    public bool NullSuggestsBinary
    {
        set
        {
            this.nullSuggestsBinary = value;
        }
    }

    /// <summary>
    /// Sets the percentage of nulls that must be exceeded on one byte parity
    /// for the data to look like UTF-16. Values outside the open range
    /// (0, 100) are silently ignored. Defaults to 70.
    /// </summary>
    public double Utf16ExpectedNullPercent
    {
        set
        {
            if (value > 0 && value < 100)
            {
                this.utf16ExpectedNullPercent = value;
            }
        }
    }

    /// <summary>
    /// Sets the percentage of nulls that must not be reached on the other
    /// byte parity for the data to look like UTF-16. Values outside the open
    /// range (0, 100) are silently ignored. Defaults to 10.
    /// </summary>
    public double Utf16UnexpectedNullPercent
    {
        set
        {
            if (value > 0 && value < 100)
            {
                this.utf16UnexpectedNullPercent = value;
            }
        }
    }

    #endregion

    /// <summary>
    /// Returns the length in bytes of the BOM implied by an encoding value.
    /// </summary>
    /// <param name="encoding">An encoding value as returned by the detector.</param>
    /// <returns>2 for the UTF-16 BOM variants, 3 for UTF8_BOM, otherwise 0.</returns>
    public static int GetBOMLengthFromEncodingMode(Encoding encoding)
    {
        switch (encoding)
        {
            case Encoding.UTF16_BE_BOM:
            case Encoding.UTF16_LE_BOM:
                return 2;

            case Encoding.UTF8_BOM:
                return 3;

            default:
                return 0;
        }
    }

    /// <summary>
    /// Detects the encoding of the first <paramref name="size"/> bytes of
    /// <paramref name="buffer"/>. The checks run in this order and the first
    /// one that gives a result wins: BOM, UTF-8 validity (which also reports
    /// pure ASCII), UTF-16 newline pairs, UTF-16 null distribution, and
    /// finally ANSI versus binary based on the presence of nulls.
    /// </summary>
    /// <param name="buffer">The bytes to examine.</param>
    /// <param name="size">The number of bytes of <paramref name="buffer"/> to examine.</param>
    /// <returns>The detected encoding, or None for unknown / binary data.</returns>
    public Encoding DetectEncoding(byte[] buffer, int size)
    {
        // First check if we have a BOM and return that if so
        Encoding encoding = this.CheckBOM(buffer, size);
        if (encoding != Encoding.None)
        {
            return encoding;
        }

        // Now check for valid UTF8 (this also reports plain ASCII)
        encoding = this.CheckUTF8(buffer, size);
        if (encoding != Encoding.None)
        {
            return encoding;
        }

        // Now try UTF16
        encoding = this.CheckUTF16NewlineChars(buffer, size);
        if (encoding != Encoding.None)
        {
            return encoding;
        }

        encoding = this.CheckUTF16ASCII(buffer, size);
        if (encoding != Encoding.None)
        {
            return encoding;
        }

        // ANSI or None (binary) then
        if (!this.DoesContainNulls(buffer, size))
        {
            return Encoding.ANSI;
        }

        // Found a null, so the answer depends on the nullSuggestsBinary preference
        return this.nullSuggestsBinary ? Encoding.None : Encoding.ANSI;
    }

    /// <summary>
    /// Checks whether the buffer starts with a UTF-16 LE, UTF-16 BE or UTF-8
    /// byte order mark.
    /// </summary>
    /// <param name="buffer">The bytes to examine.</param>
    /// <param name="size">The number of bytes of <paramref name="buffer"/> to examine.</param>
    /// <returns>The matching *_BOM value, or None when no BOM is present.</returns>
    public Encoding CheckBOM(byte[] buffer, int size)
    {
        if (size >= 2 && buffer[0] == utf16LEBOM[0] && buffer[1] == utf16LEBOM[1])
        {
            return Encoding.UTF16_LE_BOM;
        }

        if (size >= 2 && buffer[0] == utf16BEBOM[0] && buffer[1] == utf16BEBOM[1])
        {
            return Encoding.UTF16_BE_BOM;
        }

        if (size >= 3 && buffer[0] == utf8BOM[0] && buffer[1] == utf8BOM[1] && buffer[2] == utf8BOM[2])
        {
            return Encoding.UTF8_BOM;
        }

        return Encoding.None;
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Checks if a buffer contains valid utf8. Returns:
    //   None - not valid utf8
    //   UTF8_NOBOM - valid utf8 encodings and multibyte sequences
    //   ASCII - Only data in the 0-127 range.
    ///////////////////////////////////////////////////////////////////////////////

    private Encoding CheckUTF8(byte[] buffer, int size)
    {
        // UTF8 Valid sequences
        // 0xxxxxxx  ASCII
        // 110xxxxx 10xxxxxx  2-byte
        // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
        //
        // Width in UTF8
        // Decimal      Width
        // 0-127        1 byte
        // 194-223      2 bytes
        // 224-239      3 bytes
        // 240-244      4 bytes
        //
        // Subsequent chars are in the range 128-191
        bool onlySawAsciiRange = true;
        int pos = 0;

        while (pos < size)
        {
            byte ch = buffer[pos++];
            int moreChars;

            if (ch == 0 && this.nullSuggestsBinary)
            {
                return Encoding.None;
            }
            else if (ch <= 127)
            {
                // 1 byte
                moreChars = 0;
            }
            else if (ch >= 194 && ch <= 223)
            {
                // 2 byte
                moreChars = 1;
            }
            else if (ch >= 224 && ch <= 239)
            {
                // 3 byte
                moreChars = 2;
            }
            else if (ch >= 240 && ch <= 244)
            {
                // 4 byte
                moreChars = 3;
            }
            else
            {
                return Encoding.None; // Not utf8
            }

            // A lead byte is itself outside 0-127, so record that here rather
            // than only when a continuation byte is read. Otherwise a buffer
            // that ends right after a lead byte would be reported as ASCII.
            if (moreChars > 0)
            {
                onlySawAsciiRange = false;
            }

            // Check secondary chars are in range if we are expecting any.
            // A sequence cut short by the end of the buffer is tolerated.
            while (moreChars > 0 && pos < size)
            {
                ch = buffer[pos++];
                if (ch < 128 || ch > 191)
                {
                    return Encoding.None; // Not utf8
                }

                --moreChars;
            }
        }

        // If we get to here then only valid UTF-8 sequences have been processed.
        // If we only saw chars in the range 0-127 then we can't assume UTF8
        // (the caller will need to decide).
        return onlySawAsciiRange ? Encoding.ASCII : Encoding.UTF8_NOBOM;
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Checks if a buffer contains text that looks like utf16 by scanning for
    // newline chars that would be present even in non-english text.
    // Returns:
    //   None - not valid utf16
    //   UTF16_LE_NOBOM - looks like utf16 le
    //   UTF16_BE_NOBOM - looks like utf16 be
    ///////////////////////////////////////////////////////////////////////////////

    private Encoding CheckUTF16NewlineChars(byte[] buffer, int size)
    {
        if (size < 2)
        {
            return Encoding.None;
        }

        // Stop one byte early so that a full pair can always be read
        int lastPairStart = size - 1;

        int leControlChars = 0;
        int beControlChars = 0;

        int pos = 0;
        while (pos < lastPairStart)
        {
            byte ch1 = buffer[pos++];
            byte ch2 = buffer[pos++];

            if (ch1 == 0)
            {
                // 00 0A / 00 0D: CR or LF with the high byte first
                if (ch2 == 0x0a || ch2 == 0x0d)
                {
                    ++beControlChars;
                }
            }
            else if (ch2 == 0)
            {
                // 0A 00 / 0D 00: CR or LF with the low byte first
                if (ch1 == 0x0a || ch1 == 0x0d)
                {
                    ++leControlChars;
                }
            }

            // If we are getting both LE and BE control chars then this file is not utf16
            if (leControlChars > 0 && beControlChars > 0)
            {
                return Encoding.None;
            }
        }

        if (leControlChars > 0)
        {
            return Encoding.UTF16_LE_NOBOM;
        }

        if (beControlChars > 0)
        {
            return Encoding.UTF16_BE_NOBOM;
        }

        return Encoding.None;
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Checks if a buffer contains text that looks like utf16. This is done based
    // on the use of nulls which in ASCII/script like text can be useful to identify.
    // Returns:
    //   None - not valid utf16
    //   UTF16_LE_NOBOM - looks like utf16 le
    //   UTF16_BE_NOBOM - looks like utf16 be
    ///////////////////////////////////////////////////////////////////////////////

    private Encoding CheckUTF16ASCII(byte[] buffer, int size)
    {
        // Fewer than two bytes cannot hold a UTF-16 code unit. Without this
        // guard a lone null byte gives an even-null ratio of 2.0 and would be
        // reported as UTF-16 BE.
        if (size < 2)
        {
            return Encoding.None;
        }

        int numEvenNulls = 0;
        int numOddNulls = 0;

        // Count nulls at even offsets
        for (int pos = 0; pos < size; pos += 2)
        {
            if (buffer[pos] == 0)
            {
                numEvenNulls++;
            }
        }

        // Count nulls at odd offsets
        for (int pos = 1; pos < size; pos += 2)
        {
            if (buffer[pos] == 0)
            {
                numOddNulls++;
            }
        }

        // Each parity covers about half of the buffer, hence the factor of two
        double evenNullRatio = (numEvenNulls * 2.0) / size;
        double oddNullRatio = (numOddNulls * 2.0) / size;
        double expectedNullRatio = this.utf16ExpectedNullPercent / 100.0;
        double unexpectedNullRatio = this.utf16UnexpectedNullPercent / 100.0;

        // Lots of odd nulls, low number of even nulls
        if (evenNullRatio < unexpectedNullRatio && oddNullRatio > expectedNullRatio)
        {
            return Encoding.UTF16_LE_NOBOM;
        }

        // Lots of even nulls, low number of odd nulls
        if (oddNullRatio < unexpectedNullRatio && evenNullRatio > expectedNullRatio)
        {
            return Encoding.UTF16_BE_NOBOM;
        }

        // Don't know
        return Encoding.None;
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Checks if a buffer contains any nulls. Used to check for binary vs text data.
    ///////////////////////////////////////////////////////////////////////////////

    private bool DoesContainNulls(byte[] buffer, int size)
    {
        for (int pos = 0; pos < size; pos++)
        {
            if (buffer[pos] == 0)
            {
                return true;
            }
        }

        return false;
    }
}
标签:new only 其他 null ima ret out region false
原文地址:https://www.cnblogs.com/Old-Fish/p/9169111.html