|
Ruby
2.0.0p481 (2014-05-08 revision 45883)
|
00001 /********************************************************************** 00002 00003 encoding.c - 00004 00005 $Author: nagachika $ 00006 created at: Thu May 24 17:23:27 JST 2007 00007 00008 Copyright (C) 2007 Yukihiro Matsumoto 00009 00010 **********************************************************************/ 00011 00012 #include "ruby/ruby.h" 00013 #include "ruby/encoding.h" 00014 #include "internal.h" 00015 #include "regenc.h" 00016 #include <ctype.h> 00017 #ifndef NO_LOCALE_CHARMAP 00018 #ifdef __CYGWIN__ 00019 #include <windows.h> 00020 #endif 00021 #ifdef HAVE_LANGINFO_H 00022 #include <langinfo.h> 00023 #endif 00024 #endif 00025 #include "ruby/util.h" 00026 00027 #if defined __GNUC__ && __GNUC__ >= 4 00028 #pragma GCC visibility push(default) 00029 int rb_enc_register(const char *name, rb_encoding *encoding); 00030 void rb_enc_set_base(const char *name, const char *orig); 00031 void rb_encdb_declare(const char *name); 00032 int rb_encdb_replicate(const char *name, const char *orig); 00033 int rb_encdb_dummy(const char *name); 00034 int rb_encdb_alias(const char *alias, const char *orig); 00035 void rb_encdb_set_unicode(int index); 00036 #pragma GCC visibility pop 00037 #endif 00038 00039 static ID id_encoding; 00040 VALUE rb_cEncoding; 00041 static VALUE rb_encoding_list; 00042 00043 struct rb_encoding_entry { 00044 const char *name; 00045 rb_encoding *enc; 00046 rb_encoding *base; 00047 }; 00048 00049 static struct { 00050 struct rb_encoding_entry *list; 00051 int count; 00052 int size; 00053 st_table *names; 00054 } enc_table; 00055 00056 void rb_enc_init(void); 00057 00058 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX 00059 #define UNSPECIFIED_ENCODING INT_MAX 00060 00061 #define ENCODING_NAMELEN_MAX 63 00062 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX) 00063 00064 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc)) 00065 00066 static int load_encoding(const char *name); 00067 00068 static size_t 00069 
enc_memsize(const void *p) 00070 { 00071 return 0; 00072 } 00073 00074 static const rb_data_type_t encoding_data_type = { 00075 "encoding", 00076 {0, 0, enc_memsize,}, 00077 }; 00078 00079 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type) 00080 00081 static VALUE 00082 enc_new(rb_encoding *encoding) 00083 { 00084 return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding); 00085 } 00086 00087 static VALUE 00088 rb_enc_from_encoding_index(int idx) 00089 { 00090 VALUE list, enc; 00091 00092 if (!(list = rb_encoding_list)) { 00093 rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx); 00094 } 00095 enc = rb_ary_entry(list, idx); 00096 if (NIL_P(enc)) { 00097 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx); 00098 } 00099 return enc; 00100 } 00101 00102 VALUE 00103 rb_enc_from_encoding(rb_encoding *encoding) 00104 { 00105 int idx; 00106 if (!encoding) return Qnil; 00107 idx = ENC_TO_ENCINDEX(encoding); 00108 return rb_enc_from_encoding_index(idx); 00109 } 00110 00111 static int enc_autoload(rb_encoding *); 00112 00113 static int 00114 check_encoding(rb_encoding *enc) 00115 { 00116 int index = rb_enc_to_index(enc); 00117 if (rb_enc_from_index(index) != enc) 00118 return -1; 00119 if (enc_autoload_p(enc)) { 00120 index = enc_autoload(enc); 00121 } 00122 return index; 00123 } 00124 00125 static int 00126 enc_check_encoding(VALUE obj) 00127 { 00128 if (SPECIAL_CONST_P(obj) || !rb_typeddata_is_kind_of(obj, &encoding_data_type)) { 00129 return -1; 00130 } 00131 return check_encoding(RDATA(obj)->data); 00132 } 00133 00134 static int 00135 must_encoding(VALUE enc) 00136 { 00137 int index = enc_check_encoding(enc); 00138 if (index < 0) { 00139 rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding)", 00140 rb_obj_classname(enc)); 00141 } 00142 return index; 00143 } 00144 00145 int 00146 rb_to_encoding_index(VALUE enc) 00147 { 00148 int idx; 00149 00150 idx = 
enc_check_encoding(enc); 00151 if (idx >= 0) { 00152 return idx; 00153 } 00154 else if (NIL_P(enc = rb_check_string_type(enc))) { 00155 return -1; 00156 } 00157 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00158 return -1; 00159 } 00160 return rb_enc_find_index(StringValueCStr(enc)); 00161 } 00162 00163 /* Returns encoding index or UNSPECIFIED_ENCODING */ 00164 static int 00165 str_find_encindex(VALUE enc) 00166 { 00167 int idx; 00168 00169 StringValue(enc); 00170 if (!rb_enc_asciicompat(rb_enc_get(enc))) { 00171 rb_raise(rb_eArgError, "invalid name encoding (non ASCII)"); 00172 } 00173 idx = rb_enc_find_index(StringValueCStr(enc)); 00174 return idx; 00175 } 00176 00177 static int 00178 str_to_encindex(VALUE enc) 00179 { 00180 int idx = str_find_encindex(enc); 00181 if (idx < 0) { 00182 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc)); 00183 } 00184 return idx; 00185 } 00186 00187 static rb_encoding * 00188 str_to_encoding(VALUE enc) 00189 { 00190 return rb_enc_from_index(str_to_encindex(enc)); 00191 } 00192 00193 rb_encoding * 00194 rb_to_encoding(VALUE enc) 00195 { 00196 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data; 00197 return str_to_encoding(enc); 00198 } 00199 00200 rb_encoding * 00201 rb_find_encoding(VALUE enc) 00202 { 00203 int idx; 00204 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data; 00205 idx = str_find_encindex(enc); 00206 if (idx < 0) return NULL; 00207 return rb_enc_from_index(idx); 00208 } 00209 00210 void 00211 rb_gc_mark_encodings(void) 00212 { 00213 } 00214 00215 static int 00216 enc_table_expand(int newsize) 00217 { 00218 struct rb_encoding_entry *ent; 00219 int count = newsize; 00220 00221 if (enc_table.size >= newsize) return newsize; 00222 newsize = (newsize + 7) / 8 * 8; 00223 ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize); 00224 if (!ent) return -1; 00225 memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size)); 00226 enc_table.list = ent; 00227 enc_table.size = 
newsize; 00228 return count; 00229 } 00230 00231 static int 00232 enc_register_at(int index, const char *name, rb_encoding *encoding) 00233 { 00234 struct rb_encoding_entry *ent = &enc_table.list[index]; 00235 VALUE list; 00236 00237 if (!valid_encoding_name_p(name)) return -1; 00238 if (!ent->name) { 00239 ent->name = name = strdup(name); 00240 } 00241 else if (STRCASECMP(name, ent->name)) { 00242 return -1; 00243 } 00244 if (!ent->enc) { 00245 ent->enc = xmalloc(sizeof(rb_encoding)); 00246 } 00247 if (encoding) { 00248 *ent->enc = *encoding; 00249 } 00250 else { 00251 memset(ent->enc, 0, sizeof(*ent->enc)); 00252 } 00253 encoding = ent->enc; 00254 encoding->name = name; 00255 encoding->ruby_encoding_index = index; 00256 st_insert(enc_table.names, (st_data_t)name, (st_data_t)index); 00257 list = rb_encoding_list; 00258 if (list && NIL_P(rb_ary_entry(list, index))) { 00259 /* initialize encoding data */ 00260 rb_ary_store(list, index, enc_new(encoding)); 00261 } 00262 return index; 00263 } 00264 00265 static int 00266 enc_register(const char *name, rb_encoding *encoding) 00267 { 00268 int index = enc_table.count; 00269 00270 if ((index = enc_table_expand(index + 1)) < 0) return -1; 00271 enc_table.count = index; 00272 return enc_register_at(index - 1, name, encoding); 00273 } 00274 00275 static void set_encoding_const(const char *, rb_encoding *); 00276 int rb_enc_registered(const char *name); 00277 00278 int 00279 rb_enc_register(const char *name, rb_encoding *encoding) 00280 { 00281 int index = rb_enc_registered(name); 00282 00283 if (index >= 0) { 00284 rb_encoding *oldenc = rb_enc_from_index(index); 00285 if (STRCASECMP(name, rb_enc_name(oldenc))) { 00286 index = enc_register(name, encoding); 00287 } 00288 else if (enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) { 00289 enc_register_at(index, name, encoding); 00290 } 00291 else { 00292 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00293 } 00294 } 00295 else { 00296 index = 
enc_register(name, encoding); 00297 set_encoding_const(name, rb_enc_from_index(index)); 00298 } 00299 return index; 00300 } 00301 00302 void 00303 rb_encdb_declare(const char *name) 00304 { 00305 int idx = rb_enc_registered(name); 00306 if (idx < 0) { 00307 idx = enc_register(name, 0); 00308 } 00309 set_encoding_const(name, rb_enc_from_index(idx)); 00310 } 00311 00312 static void 00313 enc_check_duplication(const char *name) 00314 { 00315 if (rb_enc_registered(name) >= 0) { 00316 rb_raise(rb_eArgError, "encoding %s is already registered", name); 00317 } 00318 } 00319 00320 static rb_encoding* 00321 set_base_encoding(int index, rb_encoding *base) 00322 { 00323 rb_encoding *enc = enc_table.list[index].enc; 00324 00325 enc_table.list[index].base = base; 00326 if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc); 00327 return enc; 00328 } 00329 00330 /* for encdb.h 00331 * Set base encoding for encodings which are not replicas 00332 * but not in their own files. 00333 */ 00334 void 00335 rb_enc_set_base(const char *name, const char *orig) 00336 { 00337 int idx = rb_enc_registered(name); 00338 int origidx = rb_enc_registered(orig); 00339 set_base_encoding(idx, rb_enc_from_index(origidx)); 00340 } 00341 00342 int 00343 rb_enc_replicate(const char *name, rb_encoding *encoding) 00344 { 00345 int idx; 00346 00347 enc_check_duplication(name); 00348 idx = enc_register(name, encoding); 00349 set_base_encoding(idx, encoding); 00350 set_encoding_const(name, rb_enc_from_index(idx)); 00351 return idx; 00352 } 00353 00354 /* 00355 * call-seq: 00356 * enc.replicate(name) -> encoding 00357 * 00358 * Returns a replicated encoding of _enc_ whose name is _name_. 00359 * The new encoding should have the same byte structure of _enc_. 00360 * If _name_ is used by another encoding, raise ArgumentError. 
00361 * 00362 */ 00363 static VALUE 00364 enc_replicate(VALUE encoding, VALUE name) 00365 { 00366 return rb_enc_from_encoding_index( 00367 rb_enc_replicate(StringValueCStr(name), 00368 rb_to_encoding(encoding))); 00369 } 00370 00371 static int 00372 enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx) 00373 { 00374 if (idx < 0) { 00375 idx = enc_register(name, origenc); 00376 } 00377 else { 00378 idx = enc_register_at(idx, name, origenc); 00379 } 00380 if (idx >= 0) { 00381 set_base_encoding(idx, origenc); 00382 set_encoding_const(name, rb_enc_from_index(idx)); 00383 } 00384 return idx; 00385 } 00386 00387 int 00388 rb_encdb_replicate(const char *name, const char *orig) 00389 { 00390 int origidx = rb_enc_registered(orig); 00391 int idx = rb_enc_registered(name); 00392 00393 if (origidx < 0) { 00394 origidx = enc_register(orig, 0); 00395 } 00396 return enc_replicate_with_index(name, rb_enc_from_index(origidx), idx); 00397 } 00398 00399 int 00400 rb_define_dummy_encoding(const char *name) 00401 { 00402 int index = rb_enc_replicate(name, rb_ascii8bit_encoding()); 00403 rb_encoding *enc = enc_table.list[index].enc; 00404 00405 ENC_SET_DUMMY(enc); 00406 return index; 00407 } 00408 00409 int 00410 rb_encdb_dummy(const char *name) 00411 { 00412 int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(), 00413 rb_enc_registered(name)); 00414 rb_encoding *enc = enc_table.list[index].enc; 00415 00416 ENC_SET_DUMMY(enc); 00417 return index; 00418 } 00419 00420 /* 00421 * call-seq: 00422 * enc.dummy? -> true or false 00423 * 00424 * Returns true for dummy encodings. 00425 * A dummy encoding is an encoding for which character handling is not properly 00426 * implemented. 00427 * It is used for stateful encodings. 00428 * 00429 * Encoding::ISO_2022_JP.dummy? #=> true 00430 * Encoding::UTF_8.dummy? #=> false 00431 * 00432 */ 00433 static VALUE 00434 enc_dummy_p(VALUE enc) 00435 { 00436 return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? 
Qtrue : Qfalse; 00437 } 00438 00439 /* 00440 * call-seq: 00441 * enc.ascii_compatible? -> true or false 00442 * 00443 * Returns whether ASCII-compatible or not. 00444 * 00445 * Encoding::UTF_8.ascii_compatible? #=> true 00446 * Encoding::UTF_16BE.ascii_compatible? #=> false 00447 * 00448 */ 00449 static VALUE 00450 enc_ascii_compatible_p(VALUE enc) 00451 { 00452 return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse; 00453 } 00454 00455 /* 00456 * Returns 1 when the encoding is Unicode series other than UTF-7 else 0. 00457 */ 00458 int 00459 rb_enc_unicode_p(rb_encoding *enc) 00460 { 00461 return ONIGENC_IS_UNICODE(enc); 00462 } 00463 00464 static st_data_t 00465 enc_dup_name(st_data_t name) 00466 { 00467 return (st_data_t)strdup((const char *)name); 00468 } 00469 00470 /* 00471 * Returns copied alias name when the key is added for st_table, 00472 * else returns NULL. 00473 */ 00474 static int 00475 enc_alias_internal(const char *alias, int idx) 00476 { 00477 return st_insert2(enc_table.names, (st_data_t)alias, (st_data_t)idx, 00478 enc_dup_name); 00479 } 00480 00481 static int 00482 enc_alias(const char *alias, int idx) 00483 { 00484 if (!valid_encoding_name_p(alias)) return -1; 00485 if (!enc_alias_internal(alias, idx)) 00486 set_encoding_const(alias, rb_enc_from_index(idx)); 00487 return idx; 00488 } 00489 00490 int 00491 rb_enc_alias(const char *alias, const char *orig) 00492 { 00493 int idx; 00494 00495 enc_check_duplication(alias); 00496 if (!enc_table.list) { 00497 rb_enc_init(); 00498 } 00499 if ((idx = rb_enc_find_index(orig)) < 0) { 00500 return -1; 00501 } 00502 return enc_alias(alias, idx); 00503 } 00504 00505 int 00506 rb_encdb_alias(const char *alias, const char *orig) 00507 { 00508 int idx = rb_enc_registered(orig); 00509 00510 if (idx < 0) { 00511 idx = enc_register(orig, 0); 00512 } 00513 return enc_alias(alias, idx); 00514 } 00515 00516 void 00517 rb_encdb_set_unicode(int index) 00518 { 00519 
rb_enc_from_index(index)->flags |= ONIGENC_FLAG_UNICODE; 00520 } 00521 00522 enum { 00523 ENCINDEX_ASCII, 00524 ENCINDEX_UTF_8, 00525 ENCINDEX_US_ASCII, 00526 ENCINDEX_BUILTIN_MAX 00527 }; 00528 00529 extern rb_encoding OnigEncodingUTF_8; 00530 extern rb_encoding OnigEncodingUS_ASCII; 00531 00532 void 00533 rb_enc_init(void) 00534 { 00535 enc_table_expand(ENCODING_COUNT + 1); 00536 if (!enc_table.names) { 00537 enc_table.names = st_init_strcasetable(); 00538 } 00539 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc) 00540 ENC_REGISTER(ASCII); 00541 ENC_REGISTER(UTF_8); 00542 ENC_REGISTER(US_ASCII); 00543 #undef ENC_REGISTER 00544 enc_table.count = ENCINDEX_BUILTIN_MAX; 00545 } 00546 00547 rb_encoding * 00548 rb_enc_from_index(int index) 00549 { 00550 if (!enc_table.list) { 00551 rb_enc_init(); 00552 } 00553 if (index < 0 || enc_table.count <= index) { 00554 return 0; 00555 } 00556 return enc_table.list[index].enc; 00557 } 00558 00559 int 00560 rb_enc_registered(const char *name) 00561 { 00562 st_data_t idx = 0; 00563 00564 if (!name) return -1; 00565 if (!enc_table.list) return -1; 00566 if (st_lookup(enc_table.names, (st_data_t)name, &idx)) { 00567 return (int)idx; 00568 } 00569 return -1; 00570 } 00571 00572 static VALUE 00573 require_enc(VALUE enclib) 00574 { 00575 int safe = rb_safe_level(); 00576 return rb_require_safe(enclib, safe > 3 ? 
3 : safe); 00577 } 00578 00579 static int 00580 load_encoding(const char *name) 00581 { 00582 VALUE enclib = rb_sprintf("enc/%s.so", name); 00583 VALUE verbose = ruby_verbose; 00584 VALUE debug = ruby_debug; 00585 VALUE errinfo; 00586 VALUE loaded; 00587 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3; 00588 int idx; 00589 00590 while (s < e) { 00591 if (!ISALNUM(*s)) *s = '_'; 00592 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s); 00593 ++s; 00594 } 00595 FL_UNSET(enclib, FL_TAINT|FL_UNTRUSTED); 00596 OBJ_FREEZE(enclib); 00597 ruby_verbose = Qfalse; 00598 ruby_debug = Qfalse; 00599 errinfo = rb_errinfo(); 00600 loaded = rb_protect(require_enc, enclib, 0); 00601 ruby_verbose = verbose; 00602 ruby_debug = debug; 00603 rb_set_errinfo(errinfo); 00604 if (NIL_P(loaded)) return -1; 00605 if ((idx = rb_enc_registered(name)) < 0) return -1; 00606 if (enc_autoload_p(enc_table.list[idx].enc)) return -1; 00607 return idx; 00608 } 00609 00610 static int 00611 enc_autoload(rb_encoding *enc) 00612 { 00613 int i; 00614 rb_encoding *base = enc_table.list[ENC_TO_ENCINDEX(enc)].base; 00615 00616 if (base) { 00617 i = 0; 00618 do { 00619 if (i >= enc_table.count) return -1; 00620 } while (enc_table.list[i].enc != base && (++i, 1)); 00621 if (enc_autoload_p(base)) { 00622 if (enc_autoload(base) < 0) return -1; 00623 } 00624 i = ENC_TO_ENCINDEX(enc); 00625 enc_register_at(i, rb_enc_name(enc), base); 00626 } 00627 else { 00628 i = load_encoding(rb_enc_name(enc)); 00629 } 00630 return i; 00631 } 00632 00633 /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */ 00634 int 00635 rb_enc_find_index(const char *name) 00636 { 00637 int i = rb_enc_registered(name); 00638 rb_encoding *enc; 00639 00640 if (i < 0) { 00641 i = load_encoding(name); 00642 } 00643 else if (!(enc = rb_enc_from_index(i))) { 00644 if (i != UNSPECIFIED_ENCODING) { 00645 rb_raise(rb_eArgError, "encoding %s is not registered", name); 00646 } 00647 } 00648 else if (enc_autoload_p(enc)) { 00649 
if (enc_autoload(enc) < 0) { 00650 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead", 00651 name); 00652 return 0; 00653 } 00654 } 00655 return i; 00656 } 00657 00658 rb_encoding * 00659 rb_enc_find(const char *name) 00660 { 00661 int idx = rb_enc_find_index(name); 00662 if (idx < 0) idx = 0; 00663 return rb_enc_from_index(idx); 00664 } 00665 00666 static inline int 00667 enc_capable(VALUE obj) 00668 { 00669 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj); 00670 switch (BUILTIN_TYPE(obj)) { 00671 case T_STRING: 00672 case T_REGEXP: 00673 case T_FILE: 00674 return TRUE; 00675 case T_DATA: 00676 if (is_data_encoding(obj)) return TRUE; 00677 default: 00678 return FALSE; 00679 } 00680 } 00681 00682 ID 00683 rb_id_encoding(void) 00684 { 00685 CONST_ID(id_encoding, "encoding"); 00686 return id_encoding; 00687 } 00688 00689 int 00690 rb_enc_get_index(VALUE obj) 00691 { 00692 int i = -1; 00693 VALUE tmp; 00694 00695 if (SPECIAL_CONST_P(obj)) { 00696 if (!SYMBOL_P(obj)) return -1; 00697 obj = rb_id2str(SYM2ID(obj)); 00698 } 00699 switch (BUILTIN_TYPE(obj)) { 00700 as_default: 00701 default: 00702 case T_STRING: 00703 case T_REGEXP: 00704 i = ENCODING_GET_INLINED(obj); 00705 if (i == ENCODING_INLINE_MAX) { 00706 VALUE iv; 00707 00708 iv = rb_ivar_get(obj, rb_id_encoding()); 00709 i = NUM2INT(iv); 00710 } 00711 break; 00712 case T_FILE: 00713 tmp = rb_funcall(obj, rb_intern("internal_encoding"), 0, 0); 00714 if (NIL_P(tmp)) obj = rb_funcall(obj, rb_intern("external_encoding"), 0, 0); 00715 else obj = tmp; 00716 if (NIL_P(obj)) break; 00717 case T_DATA: 00718 if (is_data_encoding(obj)) { 00719 i = enc_check_encoding(obj); 00720 } 00721 else { 00722 goto as_default; 00723 } 00724 break; 00725 } 00726 return i; 00727 } 00728 00729 static void 00730 enc_set_index(VALUE obj, int idx) 00731 { 00732 if (idx < ENCODING_INLINE_MAX) { 00733 ENCODING_SET_INLINED(obj, idx); 00734 return; 00735 } 00736 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX); 00737 rb_ivar_set(obj, 
rb_id_encoding(), INT2NUM(idx)); 00738 } 00739 00740 void 00741 rb_enc_set_index(VALUE obj, int idx) 00742 { 00743 rb_check_frozen(obj); 00744 enc_set_index(obj, idx); 00745 } 00746 00747 VALUE 00748 rb_enc_associate_index(VALUE obj, int idx) 00749 { 00750 /* enc_check_capable(obj);*/ 00751 rb_check_frozen(obj); 00752 if (rb_enc_get_index(obj) == idx) 00753 return obj; 00754 if (SPECIAL_CONST_P(obj)) { 00755 rb_raise(rb_eArgError, "cannot set encoding"); 00756 } 00757 if (!ENC_CODERANGE_ASCIIONLY(obj) || 00758 !rb_enc_asciicompat(rb_enc_from_index(idx))) { 00759 ENC_CODERANGE_CLEAR(obj); 00760 } 00761 enc_set_index(obj, idx); 00762 return obj; 00763 } 00764 00765 VALUE 00766 rb_enc_associate(VALUE obj, rb_encoding *enc) 00767 { 00768 return rb_enc_associate_index(obj, rb_enc_to_index(enc)); 00769 } 00770 00771 rb_encoding* 00772 rb_enc_get(VALUE obj) 00773 { 00774 return rb_enc_from_index(rb_enc_get_index(obj)); 00775 } 00776 00777 rb_encoding* 00778 rb_enc_check(VALUE str1, VALUE str2) 00779 { 00780 rb_encoding *enc = rb_enc_compatible(str1, str2); 00781 if (!enc) 00782 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 00783 rb_enc_name(rb_enc_get(str1)), 00784 rb_enc_name(rb_enc_get(str2))); 00785 return enc; 00786 } 00787 00788 rb_encoding* 00789 rb_enc_compatible(VALUE str1, VALUE str2) 00790 { 00791 int idx1, idx2; 00792 rb_encoding *enc1, *enc2; 00793 int isstr1, isstr2; 00794 00795 idx1 = rb_enc_get_index(str1); 00796 idx2 = rb_enc_get_index(str2); 00797 00798 if (idx1 < 0 || idx2 < 0) 00799 return 0; 00800 00801 if (idx1 == idx2) { 00802 return rb_enc_from_index(idx1); 00803 } 00804 enc1 = rb_enc_from_index(idx1); 00805 enc2 = rb_enc_from_index(idx2); 00806 00807 isstr2 = RB_TYPE_P(str2, T_STRING); 00808 if (isstr2 && RSTRING_LEN(str2) == 0) 00809 return enc1; 00810 isstr1 = RB_TYPE_P(str1, T_STRING); 00811 if (isstr1 && RSTRING_LEN(str1) == 0) 00812 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? 
enc1 : enc2; 00813 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { 00814 return 0; 00815 } 00816 00817 /* objects whose encoding is the same of contents */ 00818 if (!isstr2 && idx2 == ENCINDEX_US_ASCII) 00819 return enc1; 00820 if (!isstr1 && idx1 == ENCINDEX_US_ASCII) 00821 return enc2; 00822 00823 if (!isstr1) { 00824 VALUE tmp = str1; 00825 int idx0 = idx1; 00826 str1 = str2; 00827 str2 = tmp; 00828 idx1 = idx2; 00829 idx2 = idx0; 00830 idx0 = isstr1; 00831 isstr1 = isstr2; 00832 isstr2 = idx0; 00833 } 00834 if (isstr1) { 00835 int cr1, cr2; 00836 00837 cr1 = rb_enc_str_coderange(str1); 00838 if (isstr2) { 00839 cr2 = rb_enc_str_coderange(str2); 00840 if (cr1 != cr2) { 00841 /* may need to handle ENC_CODERANGE_BROKEN */ 00842 if (cr1 == ENC_CODERANGE_7BIT) return enc2; 00843 if (cr2 == ENC_CODERANGE_7BIT) return enc1; 00844 } 00845 if (cr2 == ENC_CODERANGE_7BIT) { 00846 return enc1; 00847 } 00848 } 00849 if (cr1 == ENC_CODERANGE_7BIT) 00850 return enc2; 00851 } 00852 return 0; 00853 } 00854 00855 void 00856 rb_enc_copy(VALUE obj1, VALUE obj2) 00857 { 00858 rb_enc_associate_index(obj1, rb_enc_get_index(obj2)); 00859 } 00860 00861 00862 /* 00863 * call-seq: 00864 * obj.encoding -> encoding 00865 * 00866 * Returns the Encoding object that represents the encoding of obj. 
00867 */ 00868 00869 VALUE 00870 rb_obj_encoding(VALUE obj) 00871 { 00872 int idx = rb_enc_get_index(obj); 00873 if (idx < 0) { 00874 rb_raise(rb_eTypeError, "unknown encoding"); 00875 } 00876 return rb_enc_from_encoding_index(idx); 00877 } 00878 00879 int 00880 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc) 00881 { 00882 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00883 } 00884 00885 int 00886 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) 00887 { 00888 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00889 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) 00890 return MBCLEN_CHARFOUND_LEN(n); 00891 else { 00892 int min = rb_enc_mbminlen(enc); 00893 return min <= e-p ? min : (int)(e-p); 00894 } 00895 } 00896 00897 int 00898 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) 00899 { 00900 int n; 00901 if (e <= p) 00902 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 00903 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); 00904 if (e-p < n) 00905 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p)); 00906 return n; 00907 } 00908 00909 int 00910 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc) 00911 { 00912 unsigned int c, l; 00913 if (e <= p) 00914 return -1; 00915 if (rb_enc_asciicompat(enc)) { 00916 c = (unsigned char)*p; 00917 if (!ISASCII(c)) 00918 return -1; 00919 if (len) *len = 1; 00920 return c; 00921 } 00922 l = rb_enc_precise_mbclen(p, e, enc); 00923 if (!MBCLEN_CHARFOUND_P(l)) 00924 return -1; 00925 c = rb_enc_mbc_to_codepoint(p, e, enc); 00926 if (!rb_enc_isascii(c, enc)) 00927 return -1; 00928 if (len) *len = l; 00929 return c; 00930 } 00931 00932 unsigned int 00933 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc) 00934 { 00935 int r; 00936 if (e <= p) 00937 rb_raise(rb_eArgError, "empty string"); 00938 r = rb_enc_precise_mbclen(p, e, enc); 00939 if (!MBCLEN_CHARFOUND_P(r)) { 00940 
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc)); 00941 } 00942 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r); 00943 return rb_enc_mbc_to_codepoint(p, e, enc); 00944 } 00945 00946 #undef rb_enc_codepoint 00947 unsigned int 00948 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc) 00949 { 00950 return rb_enc_codepoint_len(p, e, 0, enc); 00951 } 00952 00953 int 00954 rb_enc_codelen(int c, rb_encoding *enc) 00955 { 00956 int n = ONIGENC_CODE_TO_MBCLEN(enc,c); 00957 if (n == 0) { 00958 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc)); 00959 } 00960 return n; 00961 } 00962 00963 int 00964 rb_enc_toupper(int c, rb_encoding *enc) 00965 { 00966 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c)); 00967 } 00968 00969 int 00970 rb_enc_tolower(int c, rb_encoding *enc) 00971 { 00972 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c)); 00973 } 00974 00975 /* 00976 * call-seq: 00977 * enc.inspect -> string 00978 * 00979 * Returns a string which represents the encoding for programmers. 00980 * 00981 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>" 00982 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>" 00983 */ 00984 static VALUE 00985 enc_inspect(VALUE self) 00986 { 00987 VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self), 00988 rb_enc_name((rb_encoding*)DATA_PTR(self)), 00989 (enc_dummy_p(self) ? " (dummy)" : "")); 00990 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00991 return str; 00992 } 00993 00994 /* 00995 * call-seq: 00996 * enc.name -> string 00997 * 00998 * Returns the name of the encoding. 
00999 * 01000 * Encoding::UTF_8.name #=> "UTF-8" 01001 */ 01002 static VALUE 01003 enc_name(VALUE self) 01004 { 01005 return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self))); 01006 } 01007 01008 static int 01009 enc_names_i(st_data_t name, st_data_t idx, st_data_t args) 01010 { 01011 VALUE *arg = (VALUE *)args; 01012 01013 if ((int)idx == (int)arg[0]) { 01014 VALUE str = rb_usascii_str_new2((char *)name); 01015 OBJ_FREEZE(str); 01016 rb_ary_push(arg[1], str); 01017 } 01018 return ST_CONTINUE; 01019 } 01020 01021 /* 01022 * call-seq: 01023 * enc.names -> array 01024 * 01025 * Returns the list of name and aliases of the encoding. 01026 * 01027 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"] 01028 */ 01029 static VALUE 01030 enc_names(VALUE self) 01031 { 01032 VALUE args[2]; 01033 01034 args[0] = (VALUE)rb_to_encoding_index(self); 01035 args[1] = rb_ary_new2(0); 01036 st_foreach(enc_table.names, enc_names_i, (st_data_t)args); 01037 return args[1]; 01038 } 01039 01040 /* 01041 * call-seq: 01042 * Encoding.list -> [enc1, enc2, ...] 01043 * 01044 * Returns the list of loaded encodings. 01045 * 01046 * Encoding.list 01047 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01048 * #<Encoding:ISO-2022-JP (dummy)>] 01049 * 01050 * Encoding.find("US-ASCII") 01051 * #=> #<Encoding:US-ASCII> 01052 * 01053 * Encoding.list 01054 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>, 01055 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>] 01056 * 01057 */ 01058 static VALUE 01059 enc_list(VALUE klass) 01060 { 01061 VALUE ary = rb_ary_new2(0); 01062 rb_ary_replace(ary, rb_encoding_list); 01063 return ary; 01064 } 01065 01066 /* 01067 * call-seq: 01068 * Encoding.find(string) -> enc 01069 * Encoding.find(symbol) -> enc 01070 * 01071 * Search the encoding with specified <i>name</i>. 01072 * <i>name</i> should be a string or symbol. 
01073 * 01074 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII> 01075 * Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS> 01076 * 01077 * Names which this method accept are encoding names and aliases 01078 * including following special aliases 01079 * 01080 * "external":: default external encoding 01081 * "internal":: default internal encoding 01082 * "locale":: locale encoding 01083 * "filesystem":: filesystem encoding 01084 * 01085 * An ArgumentError is raised when no encoding with <i>name</i>. 01086 * Only <code>Encoding.find("internal")</code> however returns nil 01087 * when no encoding named "internal", in other words, when Ruby has no 01088 * default internal encoding. 01089 */ 01090 static VALUE 01091 enc_find(VALUE klass, VALUE enc) 01092 { 01093 int idx; 01094 if (RB_TYPE_P(enc, T_DATA) && is_data_encoding(enc)) 01095 return enc; 01096 idx = str_to_encindex(enc); 01097 if (idx == UNSPECIFIED_ENCODING) return Qnil; 01098 return rb_enc_from_encoding_index(idx); 01099 } 01100 01101 /* 01102 * call-seq: 01103 * Encoding.compatible?(obj1, obj2) -> enc or nil 01104 * 01105 * Checks the compatibility of two objects. 01106 * 01107 * If the objects are both strings they are compatible when they are 01108 * concatenatable. The encoding of the concatenated string will be returned 01109 * if they are compatible, nil if they are not. 
01110 * 01111 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b") 01112 * #=> #<Encoding:ISO-8859-1> 01113 * 01114 * Encoding.compatible?( 01115 * "\xa1".force_encoding("iso-8859-1"), 01116 * "\xa1\xa1".force_encoding("euc-jp")) 01117 * #=> nil 01118 * 01119 * If the objects are non-strings their encodings are compatible when they 01120 * have an encoding and: 01121 * * Either encoding is US-ASCII compatible 01122 * * One of the encodings is a 7-bit encoding 01123 * 01124 */ 01125 static VALUE 01126 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2) 01127 { 01128 rb_encoding *enc; 01129 01130 if (!enc_capable(str1)) return Qnil; 01131 if (!enc_capable(str2)) return Qnil; 01132 enc = rb_enc_compatible(str1, str2); 01133 if (!enc) return Qnil; 01134 return rb_enc_from_encoding(enc); 01135 } 01136 01137 /* :nodoc: */ 01138 static VALUE 01139 enc_dump(int argc, VALUE *argv, VALUE self) 01140 { 01141 rb_scan_args(argc, argv, "01", 0); 01142 return enc_name(self); 01143 } 01144 01145 /* :nodoc: */ 01146 static VALUE 01147 enc_load(VALUE klass, VALUE str) 01148 { 01149 return enc_find(klass, str); 01150 } 01151 01152 rb_encoding * 01153 rb_ascii8bit_encoding(void) 01154 { 01155 if (!enc_table.list) { 01156 rb_enc_init(); 01157 } 01158 return enc_table.list[ENCINDEX_ASCII].enc; 01159 } 01160 01161 int 01162 rb_ascii8bit_encindex(void) 01163 { 01164 return ENCINDEX_ASCII; 01165 } 01166 01167 rb_encoding * 01168 rb_utf8_encoding(void) 01169 { 01170 if (!enc_table.list) { 01171 rb_enc_init(); 01172 } 01173 return enc_table.list[ENCINDEX_UTF_8].enc; 01174 } 01175 01176 int 01177 rb_utf8_encindex(void) 01178 { 01179 return ENCINDEX_UTF_8; 01180 } 01181 01182 rb_encoding * 01183 rb_usascii_encoding(void) 01184 { 01185 if (!enc_table.list) { 01186 rb_enc_init(); 01187 } 01188 return enc_table.list[ENCINDEX_US_ASCII].enc; 01189 } 01190 01191 int 01192 rb_usascii_encindex(void) 01193 { 01194 return ENCINDEX_US_ASCII; 01195 } 01196 01197 int 01198 
rb_locale_encindex(void) 01199 { 01200 VALUE charmap = rb_locale_charmap(rb_cEncoding); 01201 int idx; 01202 01203 if (NIL_P(charmap)) 01204 idx = rb_usascii_encindex(); 01205 else if ((idx = rb_enc_find_index(StringValueCStr(charmap))) < 0) 01206 idx = rb_ascii8bit_encindex(); 01207 01208 if (rb_enc_registered("locale") < 0) enc_alias_internal("locale", idx); 01209 01210 return idx; 01211 } 01212 01213 rb_encoding * 01214 rb_locale_encoding(void) 01215 { 01216 return rb_enc_from_index(rb_locale_encindex()); 01217 } 01218 01219 static int 01220 enc_set_filesystem_encoding(void) 01221 { 01222 int idx; 01223 #if defined NO_LOCALE_CHARMAP 01224 idx = rb_enc_to_index(rb_default_external_encoding()); 01225 #elif defined _WIN32 || defined __CYGWIN__ 01226 char cp[sizeof(int) * 8 / 3 + 4]; 01227 snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP()); 01228 idx = rb_enc_find_index(cp); 01229 if (idx < 0) idx = rb_ascii8bit_encindex(); 01230 #else 01231 idx = rb_enc_to_index(rb_default_external_encoding()); 01232 #endif 01233 01234 enc_alias_internal("filesystem", idx); 01235 return idx; 01236 } 01237 01238 int 01239 rb_filesystem_encindex(void) 01240 { 01241 int idx = rb_enc_registered("filesystem"); 01242 if (idx < 0) 01243 idx = rb_ascii8bit_encindex(); 01244 return idx; 01245 } 01246 01247 rb_encoding * 01248 rb_filesystem_encoding(void) 01249 { 01250 return rb_enc_from_index(rb_filesystem_encindex()); 01251 } 01252 01253 struct default_encoding { 01254 int index; /* -2 => not yet set, -1 => nil */ 01255 rb_encoding *enc; 01256 }; 01257 01258 static struct default_encoding default_external = {0}; 01259 01260 static int 01261 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name) 01262 { 01263 int overridden = FALSE; 01264 01265 if (def->index != -2) 01266 /* Already set */ 01267 overridden = TRUE; 01268 01269 if (NIL_P(encoding)) { 01270 def->index = -1; 01271 def->enc = 0; 01272 st_insert(enc_table.names, 
(st_data_t)strdup(name), 01273 (st_data_t)UNSPECIFIED_ENCODING); 01274 } 01275 else { 01276 def->index = rb_enc_to_index(rb_to_encoding(encoding)); 01277 def->enc = 0; 01278 enc_alias_internal(name, def->index); 01279 } 01280 01281 if (def == &default_external) 01282 enc_set_filesystem_encoding(); 01283 01284 return overridden; 01285 } 01286 01287 rb_encoding * 01288 rb_default_external_encoding(void) 01289 { 01290 if (default_external.enc) return default_external.enc; 01291 01292 if (default_external.index >= 0) { 01293 default_external.enc = rb_enc_from_index(default_external.index); 01294 return default_external.enc; 01295 } 01296 else { 01297 return rb_locale_encoding(); 01298 } 01299 } 01300 01301 VALUE 01302 rb_enc_default_external(void) 01303 { 01304 return rb_enc_from_encoding(rb_default_external_encoding()); 01305 } 01306 01307 /* 01308 * call-seq: 01309 * Encoding.default_external -> enc 01310 * 01311 * Returns default external encoding. 01312 * 01313 * The default external encoding is used by default for strings created from 01314 * the following locations: 01315 * 01316 * * CSV 01317 * * File data read from disk 01318 * * SDBM 01319 * * StringIO 01320 * * Zlib::GzipReader 01321 * * Zlib::GzipWriter 01322 * * String#inspect 01323 * * Regexp#inspect 01324 * 01325 * While strings created from these locations will have this encoding, the 01326 * encoding may not be valid. Be sure to check String#valid_encoding?. 01327 * 01328 * File data written to disk will be transcoded to the default external 01329 * encoding when written. 01330 * 01331 * The default external encoding is initialized by the locale or -E option. 
01332 */ 01333 static VALUE 01334 get_default_external(VALUE klass) 01335 { 01336 return rb_enc_default_external(); 01337 } 01338 01339 void 01340 rb_enc_set_default_external(VALUE encoding) 01341 { 01342 if (NIL_P(encoding)) { 01343 rb_raise(rb_eArgError, "default external can not be nil"); 01344 } 01345 enc_set_default_encoding(&default_external, encoding, 01346 "external"); 01347 } 01348 01349 /* 01350 * call-seq: 01351 * Encoding.default_external = enc 01352 * 01353 * Sets default external encoding. You should not set 01354 * Encoding::default_external in ruby code as strings created before changing 01355 * the value may have a different encoding from strings created after the value 01356 * was changed., instead you should use <tt>ruby -E</tt> to invoke ruby with 01357 * the correct default_external. 01358 * 01359 * See Encoding::default_external for information on how the default external 01360 * encoding is used. 01361 */ 01362 static VALUE 01363 set_default_external(VALUE klass, VALUE encoding) 01364 { 01365 rb_warning("setting Encoding.default_external"); 01366 rb_enc_set_default_external(encoding); 01367 return encoding; 01368 } 01369 01370 static struct default_encoding default_internal = {-2}; 01371 01372 rb_encoding * 01373 rb_default_internal_encoding(void) 01374 { 01375 if (!default_internal.enc && default_internal.index >= 0) { 01376 default_internal.enc = rb_enc_from_index(default_internal.index); 01377 } 01378 return default_internal.enc; /* can be NULL */ 01379 } 01380 01381 VALUE 01382 rb_enc_default_internal(void) 01383 { 01384 /* Note: These functions cope with default_internal not being set */ 01385 return rb_enc_from_encoding(rb_default_internal_encoding()); 01386 } 01387 01388 /* 01389 * call-seq: 01390 * Encoding.default_internal -> enc 01391 * 01392 * Returns default internal encoding. 
Strings will be transcoded to the 01393 * default internal encoding in the following places if the default internal 01394 * encoding is not nil: 01395 * 01396 * * CSV 01397 * * Etc.sysconfdir and Etc.systmpdir 01398 * * File data read from disk 01399 * * File names from Dir 01400 * * Integer#chr 01401 * * String#inspect and Regexp#inspect 01402 * * Strings returned from Curses 01403 * * Strings returned from Readline 01404 * * Strings returned from SDBM 01405 * * Time#zone 01406 * * Values from ENV 01407 * * Values in ARGV including $PROGRAM_NAME 01408 * * __FILE__ 01409 * 01410 * Additionally String#encode and String#encode! use the default internal 01411 * encoding if no encoding is given. 01412 * 01413 * The locale encoding (__ENCODING__), not default_internal, is used as the 01414 * encoding of created strings. 01415 * 01416 * Encoding::default_internal is initialized by the source file's 01417 * internal_encoding or -E option. 01418 */ 01419 static VALUE 01420 get_default_internal(VALUE klass) 01421 { 01422 return rb_enc_default_internal(); 01423 } 01424 01425 void 01426 rb_enc_set_default_internal(VALUE encoding) 01427 { 01428 enc_set_default_encoding(&default_internal, encoding, 01429 "internal"); 01430 } 01431 01432 /* 01433 * call-seq: 01434 * Encoding.default_internal = enc or nil 01435 * 01436 * Sets default internal encoding or removes default internal encoding when 01437 * passed nil. You should not set Encoding::default_internal in ruby code as 01438 * strings created before changing the value may have a different encoding 01439 * from strings created after the change. Instead you should use 01440 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal. 01441 * 01442 * See Encoding::default_internal for information on how the default internal 01443 * encoding is used. 
01444 */ 01445 static VALUE 01446 set_default_internal(VALUE klass, VALUE encoding) 01447 { 01448 rb_warning("setting Encoding.default_internal"); 01449 rb_enc_set_default_internal(encoding); 01450 return encoding; 01451 } 01452 01453 /* 01454 * call-seq: 01455 * Encoding.locale_charmap -> string 01456 * 01457 * Returns the locale charmap name. 01458 * It returns nil if no appropriate information. 01459 * 01460 * Debian GNU/Linux 01461 * LANG=C 01462 * Encoding.locale_charmap #=> "ANSI_X3.4-1968" 01463 * LANG=ja_JP.EUC-JP 01464 * Encoding.locale_charmap #=> "EUC-JP" 01465 * 01466 * SunOS 5 01467 * LANG=C 01468 * Encoding.locale_charmap #=> "646" 01469 * LANG=ja 01470 * Encoding.locale_charmap #=> "eucJP" 01471 * 01472 * The result is highly platform dependent. 01473 * So Encoding.find(Encoding.locale_charmap) may cause an error. 01474 * If you need some encoding object even for unknown locale, 01475 * Encoding.find("locale") can be used. 01476 * 01477 */ 01478 VALUE 01479 rb_locale_charmap(VALUE klass) 01480 { 01481 #if defined NO_LOCALE_CHARMAP 01482 return rb_usascii_str_new2("ASCII-8BIT"); 01483 #elif defined _WIN32 || defined __CYGWIN__ 01484 const char *codeset = 0; 01485 char cp[sizeof(int) * 3 + 4]; 01486 # ifdef __CYGWIN__ 01487 const char *nl_langinfo_codeset(void); 01488 codeset = nl_langinfo_codeset(); 01489 # endif 01490 if (!codeset) { 01491 UINT codepage = GetConsoleCP(); 01492 if (!codepage) codepage = GetACP(); 01493 snprintf(cp, sizeof(cp), "CP%d", codepage); 01494 codeset = cp; 01495 } 01496 return rb_usascii_str_new2(codeset); 01497 #elif defined HAVE_LANGINFO_H 01498 char *codeset; 01499 codeset = nl_langinfo(CODESET); 01500 return rb_usascii_str_new2(codeset); 01501 #else 01502 return Qnil; 01503 #endif 01504 } 01505 01506 static void 01507 set_encoding_const(const char *name, rb_encoding *enc) 01508 { 01509 VALUE encoding = rb_enc_from_encoding(enc); 01510 char *s = (char *)name; 01511 int haslower = 0, hasupper = 0, valid = 0; 01512 01513 if 
(ISDIGIT(*s)) return; 01514 if (ISUPPER(*s)) { 01515 hasupper = 1; 01516 while (*++s && (ISALNUM(*s) || *s == '_')) { 01517 if (ISLOWER(*s)) haslower = 1; 01518 } 01519 } 01520 if (!*s) { 01521 if (s - name > ENCODING_NAMELEN_MAX) return; 01522 valid = 1; 01523 rb_define_const(rb_cEncoding, name, encoding); 01524 } 01525 if (!valid || haslower) { 01526 size_t len = s - name; 01527 if (len > ENCODING_NAMELEN_MAX) return; 01528 if (!haslower || !hasupper) { 01529 do { 01530 if (ISLOWER(*s)) haslower = 1; 01531 if (ISUPPER(*s)) hasupper = 1; 01532 } while (*++s && (!haslower || !hasupper)); 01533 len = s - name; 01534 } 01535 len += strlen(s); 01536 if (len++ > ENCODING_NAMELEN_MAX) return; 01537 MEMCPY(s = ALLOCA_N(char, len), name, char, len); 01538 name = s; 01539 if (!valid) { 01540 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01541 for (; *s; ++s) { 01542 if (!ISALNUM(*s)) *s = '_'; 01543 } 01544 if (hasupper) { 01545 rb_define_const(rb_cEncoding, name, encoding); 01546 } 01547 } 01548 if (haslower) { 01549 for (s = (char *)name; *s; ++s) { 01550 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s); 01551 } 01552 rb_define_const(rb_cEncoding, name, encoding); 01553 } 01554 } 01555 } 01556 01557 static int 01558 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg) 01559 { 01560 VALUE ary = (VALUE)arg; 01561 VALUE str = rb_usascii_str_new2((char *)name); 01562 OBJ_FREEZE(str); 01563 rb_ary_push(ary, str); 01564 return ST_CONTINUE; 01565 } 01566 01567 /* 01568 * call-seq: 01569 * Encoding.name_list -> ["enc1", "enc2", ...] 01570 * 01571 * Returns the list of available encoding names. 
01572 * 01573 * Encoding.name_list 01574 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8", 01575 * "ISO-8859-1", "Shift_JIS", "EUC-JP", 01576 * "Windows-31J", 01577 * "BINARY", "CP932", "eucJP"] 01578 * 01579 */ 01580 01581 static VALUE 01582 rb_enc_name_list(VALUE klass) 01583 { 01584 VALUE ary = rb_ary_new2(enc_table.names->num_entries); 01585 st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary); 01586 return ary; 01587 } 01588 01589 static int 01590 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg) 01591 { 01592 VALUE *p = (VALUE *)arg; 01593 VALUE aliases = p[0], ary = p[1]; 01594 int idx = (int)orig; 01595 VALUE key, str = rb_ary_entry(ary, idx); 01596 01597 if (NIL_P(str)) { 01598 rb_encoding *enc = rb_enc_from_index(idx); 01599 01600 if (!enc) return ST_CONTINUE; 01601 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) { 01602 return ST_CONTINUE; 01603 } 01604 str = rb_usascii_str_new2(rb_enc_name(enc)); 01605 OBJ_FREEZE(str); 01606 rb_ary_store(ary, idx, str); 01607 } 01608 key = rb_usascii_str_new2((char *)name); 01609 OBJ_FREEZE(key); 01610 rb_hash_aset(aliases, key, str); 01611 return ST_CONTINUE; 01612 } 01613 01614 /* 01615 * call-seq: 01616 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...} 01617 * 01618 * Returns the hash of available encoding alias and original encoding name. 01619 * 01620 * Encoding.aliases 01621 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII", 01622 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"} 01623 * 01624 */ 01625 01626 static VALUE 01627 rb_enc_aliases(VALUE klass) 01628 { 01629 VALUE aliases[2]; 01630 aliases[0] = rb_hash_new(); 01631 aliases[1] = rb_ary_new(); 01632 st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases); 01633 return aliases[0]; 01634 } 01635 01636 /* 01637 * An Encoding instance represents a character encoding usable in Ruby. It is 01638 * defined as a constant under the Encoding namespace. 
It has a name and
 * optionally, aliases:
 *
 *   Encoding::ISO_8859_1.name
 *   #=> "ISO-8859-1"
 *
 *   Encoding::ISO_8859_1.names
 *   #=> ["ISO-8859-1", "ISO8859-1"]
 *
 * Ruby methods dealing with encodings return or accept Encoding instances as
 * arguments (when a method accepts an Encoding instance as an argument, it
 * can be passed an Encoding name or alias instead).
 *
 *   "some string".encoding
 *   #=> #<Encoding:UTF-8>
 *
 *   string = "some string".encode(Encoding::ISO_8859_1)
 *   #=> "some string"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *
 *   "some string".encode "ISO-8859-1"
 *   #=> "some string"
 *
 * <code>Encoding::ASCII_8BIT</code> is a special encoding that is usually
 * used for a byte string, not a character string. But as the name suggests,
 * its characters in the range of ASCII are considered as ASCII characters.
 * This is useful when you use ASCII-8BIT characters with other ASCII
 * compatible characters.
 *
 * == Changing an encoding
 *
 * The associated Encoding of a String can be changed in two different ways.
 *
 * First, it is possible to set the Encoding of a string to a new Encoding
 * without changing the internal byte representation of the string, with
 * String#force_encoding. This is how you can tell Ruby the correct encoding
 * of a string.
 *
 *   string
 *   #=> "R\xC3\xA9sum\xC3\xA9"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *   string.force_encoding(Encoding::UTF_8)
 *   #=> "R\u00E9sum\u00E9"
 *
 * Second, it is possible to transcode a string, i.e. translate its internal
 * byte representation to another encoding. Its associated encoding is also
 * set to the other encoding.
See String#encode for the various forms of
 * transcoding, and the Encoding::Converter class for additional control over
 * the transcoding process.
 *
 *   string
 *   #=> "R\u00E9sum\u00E9"
 *   string.encoding
 *   #=> #<Encoding:UTF-8>
 *   string = string.encode!(Encoding::ISO_8859_1)
 *   #=> "R\xE9sum\xE9"
 *   string.encoding
 *   #=> #<Encoding:ISO-8859-1>
 *
 * == Script encoding
 *
 * All Ruby script code has an associated Encoding which any String literal
 * created in the source code will be associated with.
 *
 * The default script encoding is <code>Encoding::US_ASCII</code>, but it can
 * be changed by a magic comment on the first line of the source code file (or
 * second line, if there is a shebang line on the first). The comment must
 * contain the word <code>coding</code> or <code>encoding</code>, followed
 * by a colon, space and the Encoding name or alias:
 *
 *   # encoding: UTF-8
 *
 *   "some string".encoding
 *   #=> #<Encoding:UTF-8>
 *
 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
 * in which the keyword is written:
 *
 *   # encoding: ISO-8859-1
 *
 *   __ENCODING__
 *   #=> #<Encoding:ISO-8859-1>
 *
 * <code>ruby -K</code> will change the default locale encoding, but this is
 * not recommended. Ruby source files should declare their script encoding by a
 * magic comment even when they only depend on US-ASCII strings or regular
 * expressions.
 *
 * == Locale encoding
 *
 * The default encoding of the environment. Usually derived from locale.
 *
 * see Encoding.locale_charmap, Encoding.find('locale')
 *
 * == Filesystem encoding
 *
 * The default encoding of strings from the filesystem of the environment.
 * This is used for strings of file names or paths.
*
 * see Encoding.find('filesystem')
 *
 * == External encoding
 *
 * Each IO object has an external encoding which indicates the encoding that
 * Ruby will use to read its data. By default Ruby sets the external encoding
 * of an IO object to the default external encoding. The default external
 * encoding is set by the locale encoding or the interpreter <code>-E</code>
 * option. Encoding.default_external returns the current value of the default
 * external encoding.
 *
 *   ENV["LANG"]
 *   #=> "UTF-8"
 *   Encoding.default_external
 *   #=> #<Encoding:UTF-8>
 *
 *   $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
 *   #<Encoding:ISO-8859-1>
 *
 *   $ LANG=C ruby -e 'p Encoding.default_external'
 *   #<Encoding:US-ASCII>
 *
 * The default external encoding may also be set through
 * Encoding.default_external=, but you should not do this as strings created
 * before and after the change will have inconsistent encodings. Instead use
 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
 *
 * When you know that the actual encoding of the data of an IO object is not
 * the default external encoding, you can reset its external encoding with
 * IO#set_encoding or set it at IO object creation (see IO.new options).
 *
 * == Internal encoding
 *
 * To process the data of an IO object which has an encoding different
 * from its external encoding, you can set its internal encoding. Ruby will use
 * this internal encoding to transcode the data when it is read from the IO
 * object.
 *
 * Conversely, when data is written to the IO object it is transcoded from the
 * internal encoding to the external encoding of the IO object.
 *
 * The internal encoding of an IO object can be set with
 * IO#set_encoding or at IO object creation (see IO.new options).
*
 * The internal encoding is optional and, when not set, the Ruby default
 * internal encoding is used. If not explicitly set, this default internal
 * encoding is +nil+, meaning that by default no transcoding occurs.
 *
 * The default internal encoding can be set with the interpreter option
 * <code>-E</code>. Encoding.default_internal returns the current default
 * internal encoding.
 *
 *   $ ruby -e 'p Encoding.default_internal'
 *   nil
 *
 *   $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
 *     Encoding.default_internal]"
 *   [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
 *
 * The default internal encoding may also be set through
 * Encoding.default_internal=, but you should not do this as strings created
 * before and after the change will have inconsistent encodings. Instead use
 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
 *
 * == IO encoding example
 *
 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
 *
 *   string = "R\u00E9sum\u00E9"
 *
 *   open("transcoded.txt", "w:ISO-8859-1") do |io|
 *     io.write(string)
 *   end
 *
 *   puts "raw text:"
 *   p File.binread("transcoded.txt")
 *   puts
 *
 *   open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
 *     puts "transcoded text:"
 *     p io.read
 *   end
 *
 * While writing the file, the internal encoding is not specified as it is
 * only necessary for reading. While reading the file both the internal and
 * external encoding must be specified to obtain the correct result.
01826 * 01827 * $ ruby t.rb 01828 * raw text: 01829 * "R\xE9sum\xE9" 01830 * 01831 * transcoded text: 01832 * "R\u00E9sum\u00E9" 01833 * 01834 */ 01835 01836 void 01837 Init_Encoding(void) 01838 { 01839 #undef rb_intern 01840 #define rb_intern(str) rb_intern_const(str) 01841 VALUE list; 01842 int i; 01843 01844 rb_cEncoding = rb_define_class("Encoding", rb_cObject); 01845 rb_undef_alloc_func(rb_cEncoding); 01846 rb_undef_method(CLASS_OF(rb_cEncoding), "new"); 01847 rb_define_method(rb_cEncoding, "to_s", enc_name, 0); 01848 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0); 01849 rb_define_method(rb_cEncoding, "name", enc_name, 0); 01850 rb_define_method(rb_cEncoding, "names", enc_names, 0); 01851 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0); 01852 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0); 01853 rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1); 01854 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0); 01855 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0); 01856 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0); 01857 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1); 01858 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2); 01859 01860 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1); 01861 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1); 01862 01863 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0); 01864 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1); 01865 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0); 01866 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1); 01867 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); 01868 01869 list = rb_ary_new2(enc_table.count); 01870 RBASIC(list)->klass = 0; 01871 
rb_encoding_list = list; 01872 rb_gc_register_mark_object(list); 01873 01874 for (i = 0; i < enc_table.count; ++i) { 01875 rb_ary_push(list, enc_new(enc_table.list[i].enc)); 01876 } 01877 } 01878 01879 /* locale insensitive ctype functions */ 01880 01881 #define ctype_test(c, ctype) \ 01882 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), (ctype))) 01883 01884 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); } 01885 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); } 01886 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); } 01887 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); } 01888 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); } 01889 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); } 01890 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); } 01891 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); } 01892 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); } 01893 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); } 01894 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); } 01895 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); } 01896 01897 int 01898 rb_tolower(int c) 01899 { 01900 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) : c; 01901 } 01902 01903 int 01904 rb_toupper(int c) 01905 { 01906 return rb_isascii(c) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) : c; 01907 } 01908 01909
1.7.6.1