标签:ring length chm with diff size sage upper div
List: imap I tried the code you referenced (the exact program and compilation script are in attachments), but it failed. The program takes input as modified UTF-7, uses MailboxToURL routine to change it to UTF-8 and then uses the URLtoMailbox routine to change it to UTF-7 again: int main(int argc, char* argv[]){ char out[OUTSIZE]; char in[OUTSIZE]; strcpy(in,argv[1]); printf("in: %s\n",in); MailboxToURL(out,in); printf("out: %s\n",out); URLtoMailbox(in,out); printf("in: %s\n",in); } As an input I gave it the following UTF-7 code: a&AQUBBA-e&AFC-f which is the code produced by Microsoft Outlook and contains a bunch of Polish letters. The output of the program is the following: [tomcat@fatcat]$ ./utf7test 'a&AQUBBA-e&AFC-f' in: a&AQUBBA-e&AFC-f out: a%C4%85%C4%84e%50f in: a&AQUBBA-ePf So, as you can see, the conversion is not 1:1 ;-)))) Strange enough, if I use the resulting output (a&AQUBBA-ePf) as an input to another iteration, it starts behaving correctly ;-))) Can you help me? Marek. ps. I tried the code on linux. There are a couple of strange assignments in the code (like unsigned long variable = char variable), so I mention it just in case this might be of some importance. > -----Original Message----- > From: Chris Newman [mailto:chris+imap@innosoft.com] > Sent: Tuesday, July 24, 2001 8:43 PM > To: Marek Kowal; imap@u.washington.edu > Subject: Re: modified UTF7 to UTF8 conversion > > > Try the code in: > <http://www.innosoft.com/rfc/rfc2192.html> > > It's missing a security check for invalid UTF-8 characters > on input, but > is otherwise correct to my knowledge. If it's broken, please > email me the > example which breaks it so I can fix the code. > > - Chris > > --On Monday, July 23, 2001 19:52 +0200 Marek Kowal > <kowalm@onet.pl> wrote: > > > Hi there, > > > > I am having a HARD time trying to convert modified UTF7 > mailbox names to > > UTF8 (which I then convert to ISO-8859-2 using iconv > library, BTW). 
I > > tried the UTF7 to URL UTF8 code (which I found in imap > discussion list, > > > http://www.washington.edu/imap/listarch/1997/msg00800.html), > but it does > > not seem to work correctly - if I run it one-way and then > back on some > > string, sometimes I get different results - the resulting > UTF7 code is > > not the same. > > > > Anyway, can anybody point me to the proper conversion > routines which can > > transform between modified UTF7 and UTF8? It could be > separate code, but > > if anybody did it already as iconv conversion table, that > would be great. > ["compile" (application/octet-stream)] ["utf7test.c" (application/octet-stream)] #include <stdio.h> #include <string.h> #include <iconv.h> #define OUTSIZE 1024 /* hexadecimal lookup table */ static char hex[] = "0123456789ABCDEF"; /* URL unsafe printable characters */ static char urlunsafe[] = " \"#%&+:;<=>?@[\\]^`{|}"; /* UTF7 modified base64 alphabet */ static char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; #define UNDEFINED 64 /* UTF16 definitions */ #define UTF16MASK 0x03FFUL #define UTF16SHIFT 10 #define UTF16BASE 0x10000UL #define UTF16HIGHSTART 0xD800UL #define UTF16HIGHEND 0xDBFFUL #define UTF16LOSTART 0xDC00UL #define UTF16LOEND 0xDFFFUL /* Convert an IMAP mailbox to a URL path * dst needs to have roughly 4 times the storage space of src * Hex encoding can triple the size of the input * UTF-7 can be slightly denser than UTF-8 * (worst case: 8 octets UTF-7 becomes 9 octets UTF-8) */ void MailboxToURL(char *dst, char *src) { unsigned char c, i, bitcount; unsigned long ucs4, utf16, bitbuf; unsigned char base64[256], utf8[6]; /* initialize modified base64 decoding table */ memset(base64, UNDEFINED, sizeof (base64)); for (i = 0; i < sizeof (base64chars); ++i) { base64[base64chars[i]] = i; } /* loop until end of string */ while (*src != ‘\0‘) { c = *src++; /* deal with literal characters and &- */ if (c != ‘&‘ || *src == ‘-‘) { if (c < ‘ ‘ || c > ‘~‘ || 
strchr(urlunsafe, c) != NULL) { /* hex encode if necessary */ dst[0] = ‘%‘; dst[1] = hex[c >> 4]; dst[2] = hex[c & 0x0f]; dst += 3; } else { /* encode literally */ *dst++ = c; } /* skip over the ‘-‘ if this is an &- sequence */ if (c == ‘&‘) ++src; } else { /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */ bitbuf = 0; bitcount = 0; ucs4 = 0; while ((c = base64[(unsigned char) *src]) != UNDEFINED) { ++src; bitbuf = (bitbuf << 6) | c; bitcount += 6; /* enough bits for a UTF-16 character? */ if (bitcount >= 16) { bitcount -= 16; utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff; /* convert UTF16 to UCS4 */ if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) { ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT; continue; } else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) { ucs4 += utf16 - UTF16LOSTART + UTF16BASE; } else { ucs4 = utf16; } /* convert UTF-16 range of UCS4 to UTF-8 */ if (ucs4 <= 0x7fUL) { utf8[0] = ucs4; i = 1; } else if (ucs4 <= 0x7ffUL) { utf8[0] = 0xc0 | (ucs4 >> 6); utf8[1] = 0x80 | (ucs4 & 0x3f); i = 2; } else if (ucs4 <= 0xffffUL) { utf8[0] = 0xe0 | (ucs4 >> 12); utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); utf8[2] = 0x80 | (ucs4 & 0x3f); i = 3; } else { utf8[0] = 0xf0 | (ucs4 >> 18); utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); utf8[3] = 0x80 | (ucs4 & 0x3f); i = 4; } /* convert utf8 to hex */ for (c = 0; c < i; ++c) { dst[0] = ‘%‘; dst[1] = hex[utf8[c] >> 4]; dst[2] = hex[utf8[c] & 0x0f]; dst += 3; } } } /* skip over trailing ‘-‘ in modified UTF-7 encoding */ if (*src == ‘-‘) ++src; } } /* terminate destination string */ *dst = ‘\0‘; } /* Convert hex coded UTF-8 URL path to modified UTF-7 IMAP mailbox * dst should be about twice the length of src to deal with non-hex * coded URLs */ void URLtoMailbox(char *dst, char *src) { unsigned int utf8pos, utf8total, i, c, utf7mode, bitstogo, utf16flag; unsigned long ucs4, bitbuf; unsigned char hextab[256]; /* initialize hex lookup table */ memset(hextab, 
0, sizeof (hextab)); for (i = 0; i < sizeof (hex); ++i) { hextab[hex[i]] = i; if (isupper(hex[i])) hextab[tolower(hex[i])] = i; } utf7mode = 0; utf8total = 0; bitstogo = 0; while ((c = *src) != ‘\0‘) { ++src; /* undo hex-encoding */ if (c == ‘%‘ && src[0] != ‘\0‘ && src[1] != ‘\0‘) { c = (hextab[src[0]] << 4) | hextab[src[1]]; src += 2; } /* normal character? */ if (c >= ‘ ‘ && c <= ‘~‘) { /* switch out of UTF-7 mode */ if (utf7mode) { if (bitstogo) { *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; } *dst++ = ‘-‘; utf7mode = 0; } *dst++ = c; /* encode ‘&‘ as ‘&-‘ */ if (c == ‘&‘) { *dst++ = ‘-‘; } continue; } /* switch to UTF-7 mode */ if (!utf7mode) { *dst++ = ‘&‘; utf7mode = 1; } /* Encode US-ASCII characters as themselves */ if (c < 0x80) { ucs4 = c; utf8total = 1; } else if (utf8total) { /* save UTF8 bits into UCS4 */ ucs4 = (ucs4 << 6) | (c & 0x3FUL); if (++utf8pos < utf8total) { continue; } } else { utf8pos = 1; if (c < 0xE0) { utf8total = 2; ucs4 = c & 0x1F; } else if (c < 0xF0) { utf8total = 3; ucs4 = c & 0x0F; } else { /* NOTE: can‘t convert UTF8 sequences longer than 4 */ utf8total = 4; ucs4 = c & 0x03; } continue; } /* loop to split ucs4 into two utf16 chars if necessary */ utf8total = 0; do { if (ucs4 >= UTF16BASE) { ucs4 -= UTF16BASE; bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART); ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART; utf16flag = 1; } else { bitbuf = (bitbuf << 16) | ucs4; utf16flag = 0; } bitstogo += 16; /* spew out base64 */ while (bitstogo >= 6) { bitstogo -= 6; *dst++ = base64chars[(bitstogo ? 
(bitbuf >> bitstogo) : bitbuf) & 0x3F]; } } while (utf16flag); } /* if in UTF-7 mode, finish in ASCII */ if (utf7mode) { if (bitstogo) { *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F]; } *dst++ = ‘-‘; } /* tie off string */ *dst = ‘\0‘; } int main(int argc, char* argv[]){ char out[OUTSIZE]; char in[OUTSIZE]; strcpy(in,argv[1]); printf("in: %s\n",in); MailboxToURL(out,in); printf("out: %s\n",out); URLtoMailbox(in,out); printf("in: %s\n",in); }
标签:ring length chm with diff size sage upper div
原文地址:http://www.cnblogs.com/fanchaostudy/p/7144031.html