|
Ruby
2.0.0p481 (2014-05-08 revision 45883)
|
/**********************************************************************

  string.c -

  $Author: nagachika $
  created at: Mon Aug 9 17:12:58 JST 1993

  Copyright (C) 1993-2007 Yukihiro Matsumoto
  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
  Copyright (C) 2000 Information-technology Promotion Agency, Japan

**********************************************************************/

#include "ruby/ruby.h"
#include "ruby/re.h"
#include "ruby/encoding.h"
#include "vm_core.h"
#include "internal.h"
#include "probes.h"
#include <assert.h>

/* Entry `no` of regs->beg / regs->end.  Both macros expect a variable
 * named `regs` to be in scope at the point of use. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])

#include <math.h>
#include <ctype.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

/* Number of elements of a real array (not a pointer), cast to int. */
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))

/* Remove any macro definitions of these names (presumably supplied by the
 * public headers) so that functions with the same names can be defined in
 * this file. */
#undef rb_str_new_cstr
#undef rb_tainted_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_new2
#undef rb_str_new3
#undef rb_str_new4
#undef rb_str_new5
#undef rb_tainted_str_new2
#undef rb_usascii_str_new2
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_new2
#undef rb_str_buf_cat2
#undef rb_str_cat2

static VALUE rb_str_clear(VALUE str);

VALUE rb_cString;
VALUE rb_cSymbol;

#define RUBY_MAX_CHAR_LEN 16

/* Per-object flag bits used by strings in this file. */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3

/* Predicates combining STR_NOEMBED with the flag that selects which member
 * of as.heap.aux is in use (shared / assoc instead of capa). */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True when the string is not embedded and is shared or assoc, i.e. when
 * as.heap.aux does not hold a capacity. */
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc flags, but only on a non-embedded string. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)


/* Switch to the heap representation; the embedded-length bits in the flags
 * word are zeroed at the same time. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Store n in the embedded-length bit field of the flags word.  `n` is
 * evaluated once (copied into tmp_n). */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)

/* Set the byte length, writing to whichever place the current
 * representation (embedded or heap) keeps it. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrement the byte length by one for either representation. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)

/*
 * Make room for `capacity` bytes plus one extra byte.
 *  - Embedded string: only acts when capacity exceeds RSTRING_EMBED_LEN_MAX;
 *    the bytes are copied into a fresh heap buffer and the string becomes
 *    non-embedded.  Smaller requests leave the string untouched.
 *  - Heap string: the buffer is reallocated; the recorded capacity is only
 *    updated when aux actually holds a capacity (!STR_NOCAPA_P).
 * NOTE: `str` and `capacity` are evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)

/* Coderange shortcuts; both may trigger a scan via rb_enc_str_coderange(). */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* rb_encoding* for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))

00125 static inline int 00126 single_byte_optimizable(VALUE str) 00127 { 00128 rb_encoding *enc; 00129 00130 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 00131 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 00132 return 1; 00133 00134 enc = STR_ENC_GET(str); 00135 if (rb_enc_mbmaxlen(enc) == 1) 00136 return 1; 00137 00138 /* Conservative. Possibly single byte. 00139 * "\xa1" in Shift_JIS for example. */ 00140 return 0; 00141 } 00142 00143 VALUE rb_fs; 00144 00145 static inline const char * 00146 search_nonascii(const char *p, const char *e) 00147 { 00148 #if SIZEOF_VALUE == 8 00149 # define NONASCII_MASK 0x8080808080808080ULL 00150 #elif SIZEOF_VALUE == 4 00151 # define NONASCII_MASK 0x80808080UL 00152 #endif 00153 #ifdef NONASCII_MASK 00154 if ((int)sizeof(VALUE) * 2 < e - p) { 00155 const VALUE *s, *t; 00156 const VALUE lowbits = sizeof(VALUE) - 1; 00157 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 00158 while (p < (const char *)s) { 00159 if (!ISASCII(*p)) 00160 return p; 00161 p++; 00162 } 00163 t = (const VALUE*)(~lowbits & (VALUE)e); 00164 while (s < t) { 00165 if (*s & NONASCII_MASK) { 00166 t = s; 00167 break; 00168 } 00169 s++; 00170 } 00171 p = (const char *)t; 00172 } 00173 #endif 00174 while (p < e) { 00175 if (!ISASCII(*p)) 00176 return p; 00177 p++; 00178 } 00179 return NULL; 00180 } 00181 00182 static int 00183 coderange_scan(const char *p, long len, rb_encoding *enc) 00184 { 00185 const char *e = p + len; 00186 00187 if (rb_enc_to_index(enc) == 0) { 00188 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00189 p = search_nonascii(p, e); 00190 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 00191 } 00192 00193 if (rb_enc_asciicompat(enc)) { 00194 p = search_nonascii(p, e); 00195 if (!p) { 00196 return ENC_CODERANGE_7BIT; 00197 } 00198 while (p < e) { 00199 int ret = rb_enc_precise_mbclen(p, e, enc); 00200 if (!MBCLEN_CHARFOUND_P(ret)) { 00201 return ENC_CODERANGE_BROKEN; 00202 } 00203 p += MBCLEN_CHARFOUND_LEN(ret); 00204 if (p < e) { 00205 p = search_nonascii(p, e); 00206 if (!p) { 00207 return ENC_CODERANGE_VALID; 00208 } 00209 } 00210 } 00211 if (e < p) { 00212 return ENC_CODERANGE_BROKEN; 00213 } 00214 return ENC_CODERANGE_VALID; 00215 } 00216 00217 while (p < e) { 00218 int ret = rb_enc_precise_mbclen(p, e, enc); 00219 00220 if (!MBCLEN_CHARFOUND_P(ret)) { 00221 return ENC_CODERANGE_BROKEN; 00222 } 00223 p += MBCLEN_CHARFOUND_LEN(ret); 00224 } 00225 if (e < p) { 00226 return ENC_CODERANGE_BROKEN; 00227 } 00228 return ENC_CODERANGE_VALID; 00229 } 00230 00231 long 00232 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 00233 { 00234 const char *p = s; 00235 00236 if (*cr == ENC_CODERANGE_BROKEN) 00237 return e - s; 00238 00239 if (rb_enc_to_index(enc) == 0) { 00240 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00241 p = search_nonascii(p, e); 00242 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 00243 return e - s; 00244 } 00245 else if (rb_enc_asciicompat(enc)) { 00246 p = search_nonascii(p, e); 00247 if (!p) { 00248 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 00249 return e - s; 00250 } 00251 while (p < e) { 00252 int ret = rb_enc_precise_mbclen(p, e, enc); 00253 if (!MBCLEN_CHARFOUND_P(ret)) { 00254 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00255 return p - s; 00256 } 00257 p += MBCLEN_CHARFOUND_LEN(ret); 00258 if (p < e) { 00259 p = search_nonascii(p, e); 00260 if (!p) { 00261 *cr = ENC_CODERANGE_VALID; 00262 return e - s; 00263 } 00264 } 00265 } 00266 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00267 return p - s; 00268 } 00269 else { 00270 while (p < e) { 00271 int ret = rb_enc_precise_mbclen(p, e, enc); 00272 if (!MBCLEN_CHARFOUND_P(ret)) { 00273 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00274 return p - s; 00275 } 00276 p += MBCLEN_CHARFOUND_LEN(ret); 00277 } 00278 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00279 return p - s; 00280 } 00281 } 00282 00283 static inline void 00284 str_enc_copy(VALUE str1, VALUE str2) 00285 { 00286 rb_enc_set_index(str1, ENCODING_GET(str2)); 00287 } 00288 00289 static void 00290 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 00291 { 00292 /* this function is designed for copying encoding and coderange 00293 * from src to new string "dest" which is made from the part of src. 00294 */ 00295 str_enc_copy(dest, src); 00296 if (RSTRING_LEN(dest) == 0) { 00297 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00299 else 00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00301 return; 00302 } 00303 switch (ENC_CODERANGE(src)) { 00304 case ENC_CODERANGE_7BIT: 00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00306 break; 00307 case ENC_CODERANGE_VALID: 00308 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 00309 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00311 else 00312 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00313 break; 00314 default: 00315 break; 00316 } 00317 } 00318 00319 static void 00320 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 00321 { 00322 str_enc_copy(dest, src); 00323 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 00324 } 00325 00326 int 00327 rb_enc_str_coderange(VALUE str) 00328 { 00329 int cr = ENC_CODERANGE(str); 00330 00331 if (cr == ENC_CODERANGE_UNKNOWN) { 00332 rb_encoding *enc = STR_ENC_GET(str); 00333 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 00334 ENC_CODERANGE_SET(str, cr); 
00335 } 00336 return cr; 00337 } 00338 00339 int 00340 rb_enc_str_asciionly_p(VALUE str) 00341 { 00342 rb_encoding *enc = STR_ENC_GET(str); 00343 00344 if (!rb_enc_asciicompat(enc)) 00345 return FALSE; 00346 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00347 return TRUE; 00348 return FALSE; 00349 } 00350 00351 static inline void 00352 str_mod_check(VALUE s, const char *p, long len) 00353 { 00354 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 00355 rb_raise(rb_eRuntimeError, "string modified"); 00356 } 00357 } 00358 00359 size_t 00360 rb_str_capacity(VALUE str) 00361 { 00362 if (STR_EMBED_P(str)) { 00363 return RSTRING_EMBED_LEN_MAX; 00364 } 00365 else if (STR_NOCAPA_P(str)) { 00366 return RSTRING(str)->as.heap.len; 00367 } 00368 else { 00369 return RSTRING(str)->as.heap.aux.capa; 00370 } 00371 } 00372 00373 static inline VALUE 00374 str_alloc(VALUE klass) 00375 { 00376 NEWOBJ_OF(str, struct RString, klass, T_STRING); 00377 00378 str->as.heap.ptr = 0; 00379 str->as.heap.len = 0; 00380 str->as.heap.aux.capa = 0; 00381 00382 return (VALUE)str; 00383 } 00384 00385 static inline VALUE 00386 empty_str_alloc(VALUE klass) 00387 { 00388 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00389 RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline()); 00390 } 00391 return str_alloc(klass); 00392 } 00393 00394 static VALUE 00395 str_new(VALUE klass, const char *ptr, long len) 00396 { 00397 VALUE str; 00398 00399 if (len < 0) { 00400 rb_raise(rb_eArgError, "negative string size (or size too big)"); 00401 } 00402 00403 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00404 RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline()); 00405 } 00406 00407 str = str_alloc(klass); 00408 if (len > RSTRING_EMBED_LEN_MAX) { 00409 RSTRING(str)->as.heap.aux.capa = len; 00410 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 00411 STR_SET_NOEMBED(str); 00412 } 00413 else if (len == 0) { 00414 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 00415 } 00416 if (ptr) { 00417 
memcpy(RSTRING_PTR(str), ptr, len); 00418 } 00419 STR_SET_LEN(str, len); 00420 RSTRING_PTR(str)[len] = '\0'; 00421 return str; 00422 } 00423 00424 VALUE 00425 rb_str_new(const char *ptr, long len) 00426 { 00427 return str_new(rb_cString, ptr, len); 00428 } 00429 00430 VALUE 00431 rb_usascii_str_new(const char *ptr, long len) 00432 { 00433 VALUE str = rb_str_new(ptr, len); 00434 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00435 return str; 00436 } 00437 00438 VALUE 00439 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 00440 { 00441 VALUE str = rb_str_new(ptr, len); 00442 rb_enc_associate(str, enc); 00443 return str; 00444 } 00445 00446 VALUE 00447 rb_str_new_cstr(const char *ptr) 00448 { 00449 if (!ptr) { 00450 rb_raise(rb_eArgError, "NULL pointer given"); 00451 } 00452 return rb_str_new(ptr, strlen(ptr)); 00453 } 00454 00455 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 00456 #define rb_str_new2 rb_str_new_cstr 00457 00458 VALUE 00459 rb_usascii_str_new_cstr(const char *ptr) 00460 { 00461 VALUE str = rb_str_new2(ptr); 00462 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00463 return str; 00464 } 00465 00466 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 00467 #define rb_usascii_str_new2 rb_usascii_str_new_cstr 00468 00469 VALUE 00470 rb_tainted_str_new(const char *ptr, long len) 00471 { 00472 VALUE str = rb_str_new(ptr, len); 00473 00474 OBJ_TAINT(str); 00475 return str; 00476 } 00477 00478 VALUE 00479 rb_tainted_str_new_cstr(const char *ptr) 00480 { 00481 VALUE str = rb_str_new2(ptr); 00482 00483 OBJ_TAINT(str); 00484 return str; 00485 } 00486 00487 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 00488 #define rb_tainted_str_new2 rb_tainted_str_new_cstr 00489 00490 VALUE 00491 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 00492 { 00493 
extern VALUE rb_cEncodingConverter; 00494 rb_econv_t *ec; 00495 rb_econv_result_t ret; 00496 long len, olen; 00497 VALUE econv_wrapper; 00498 VALUE newstr; 00499 const unsigned char *start, *sp; 00500 unsigned char *dest, *dp; 00501 size_t converted_output = 0; 00502 00503 if (!to) return str; 00504 if (!from) from = rb_enc_get(str); 00505 if (from == to) return str; 00506 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 00507 to == rb_ascii8bit_encoding()) { 00508 if (STR_ENC_GET(str) != to) { 00509 str = rb_str_dup(str); 00510 rb_enc_associate(str, to); 00511 } 00512 return str; 00513 } 00514 00515 len = RSTRING_LEN(str); 00516 newstr = rb_str_new(0, len); 00517 olen = len; 00518 00519 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter); 00520 RBASIC(econv_wrapper)->klass = 0; 00521 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 00522 if (!ec) return str; 00523 DATA_PTR(econv_wrapper) = ec; 00524 00525 sp = (unsigned char*)RSTRING_PTR(str); 00526 start = sp; 00527 while ((dest = (unsigned char*)RSTRING_PTR(newstr)), 00528 (dp = dest + converted_output), 00529 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)), 00530 ret == econv_destination_buffer_full) { 00531 /* destination buffer short */ 00532 size_t converted_input = sp - start; 00533 size_t rest = len - converted_input; 00534 converted_output = dp - dest; 00535 rb_str_set_len(newstr, converted_output); 00536 if (converted_input && converted_output && 00537 rest < (LONG_MAX / converted_output)) { 00538 rest = (rest * converted_output) / converted_input; 00539 } 00540 else { 00541 rest = olen; 00542 } 00543 olen += rest < 2 ? 
2 : rest; 00544 rb_str_resize(newstr, olen); 00545 } 00546 DATA_PTR(econv_wrapper) = 0; 00547 rb_econv_close(ec); 00548 rb_gc_force_recycle(econv_wrapper); 00549 switch (ret) { 00550 case econv_finished: 00551 len = dp - (unsigned char*)RSTRING_PTR(newstr); 00552 rb_str_set_len(newstr, len); 00553 rb_enc_associate(newstr, to); 00554 return newstr; 00555 00556 default: 00557 /* some error, return original */ 00558 return str; 00559 } 00560 } 00561 00562 VALUE 00563 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 00564 { 00565 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 00566 } 00567 00568 VALUE 00569 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 00570 { 00571 VALUE str; 00572 00573 str = rb_tainted_str_new(ptr, len); 00574 if (eenc == rb_usascii_encoding() && 00575 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 00576 rb_enc_associate(str, rb_ascii8bit_encoding()); 00577 return str; 00578 } 00579 rb_enc_associate(str, eenc); 00580 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 00581 } 00582 00583 VALUE 00584 rb_external_str_new(const char *ptr, long len) 00585 { 00586 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 00587 } 00588 00589 VALUE 00590 rb_external_str_new_cstr(const char *ptr) 00591 { 00592 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 00593 } 00594 00595 VALUE 00596 rb_locale_str_new(const char *ptr, long len) 00597 { 00598 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 00599 } 00600 00601 VALUE 00602 rb_locale_str_new_cstr(const char *ptr) 00603 { 00604 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 00605 } 00606 00607 VALUE 00608 rb_filesystem_str_new(const char *ptr, long len) 00609 { 00610 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 00611 } 00612 00613 VALUE 00614 rb_filesystem_str_new_cstr(const char *ptr) 00615 { 00616 
return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 00617 } 00618 00619 VALUE 00620 rb_str_export(VALUE str) 00621 { 00622 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding()); 00623 } 00624 00625 VALUE 00626 rb_str_export_locale(VALUE str) 00627 { 00628 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 00629 } 00630 00631 VALUE 00632 rb_str_export_to_enc(VALUE str, rb_encoding *enc) 00633 { 00634 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 00635 } 00636 00637 static VALUE 00638 str_replace_shared_without_enc(VALUE str2, VALUE str) 00639 { 00640 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 00641 STR_SET_EMBED(str2); 00642 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 00643 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 00644 } 00645 else { 00646 str = rb_str_new_frozen(str); 00647 FL_SET(str2, STR_NOEMBED); 00648 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00649 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00650 RSTRING(str2)->as.heap.aux.shared = str; 00651 FL_SET(str2, ELTS_SHARED); 00652 } 00653 return str2; 00654 } 00655 00656 static VALUE 00657 str_replace_shared(VALUE str2, VALUE str) 00658 { 00659 str_replace_shared_without_enc(str2, str); 00660 rb_enc_cr_str_exact_copy(str2, str); 00661 return str2; 00662 } 00663 00664 static VALUE 00665 str_new_shared(VALUE klass, VALUE str) 00666 { 00667 return str_replace_shared(str_alloc(klass), str); 00668 } 00669 00670 static VALUE 00671 str_new3(VALUE klass, VALUE str) 00672 { 00673 return str_new_shared(klass, str); 00674 } 00675 00676 VALUE 00677 rb_str_new_shared(VALUE str) 00678 { 00679 VALUE str2 = str_new3(rb_obj_class(str), str); 00680 00681 OBJ_INFECT(str2, str); 00682 return str2; 00683 } 00684 00685 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 00686 #define rb_str_new3 rb_str_new_shared 00687 00688 static VALUE 00689 str_new4(VALUE klass, VALUE str) 00690 { 00691 VALUE str2; 
00692 00693 str2 = str_alloc(klass); 00694 STR_SET_NOEMBED(str2); 00695 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00696 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00697 if (STR_SHARED_P(str)) { 00698 VALUE shared = RSTRING(str)->as.heap.aux.shared; 00699 assert(OBJ_FROZEN(shared)); 00700 FL_SET(str2, ELTS_SHARED); 00701 RSTRING(str2)->as.heap.aux.shared = shared; 00702 } 00703 else { 00704 FL_SET(str, ELTS_SHARED); 00705 RSTRING(str)->as.heap.aux.shared = str2; 00706 } 00707 rb_enc_cr_str_exact_copy(str2, str); 00708 OBJ_INFECT(str2, str); 00709 return str2; 00710 } 00711 00712 VALUE 00713 rb_str_new_frozen(VALUE orig) 00714 { 00715 VALUE klass, str; 00716 00717 if (OBJ_FROZEN(orig)) return orig; 00718 klass = rb_obj_class(orig); 00719 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 00720 long ofs; 00721 assert(OBJ_FROZEN(str)); 00722 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 00723 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 00724 ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) || 00725 ENCODING_GET(str) != ENCODING_GET(orig)) { 00726 str = str_new3(klass, str); 00727 RSTRING(str)->as.heap.ptr += ofs; 00728 RSTRING(str)->as.heap.len -= ofs; 00729 rb_enc_cr_str_exact_copy(str, orig); 00730 OBJ_INFECT(str, orig); 00731 } 00732 } 00733 else if (STR_EMBED_P(orig)) { 00734 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 00735 rb_enc_cr_str_exact_copy(str, orig); 00736 OBJ_INFECT(str, orig); 00737 } 00738 else if (STR_ASSOC_P(orig)) { 00739 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 00740 FL_UNSET(orig, STR_ASSOC); 00741 str = str_new4(klass, orig); 00742 FL_SET(str, STR_ASSOC); 00743 RSTRING(str)->as.heap.aux.shared = assoc; 00744 } 00745 else { 00746 str = str_new4(klass, orig); 00747 } 00748 OBJ_FREEZE(str); 00749 return str; 00750 } 00751 00752 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 00753 #define rb_str_new4 rb_str_new_frozen 00754 00755 VALUE 00756 
rb_str_new_with_class(VALUE obj, const char *ptr, long len) 00757 { 00758 return str_new(rb_obj_class(obj), ptr, len); 00759 } 00760 00761 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 00762 rb_str_new_with_class, (obj, ptr, len)) 00763 #define rb_str_new5 rb_str_new_with_class 00764 00765 static VALUE 00766 str_new_empty(VALUE str) 00767 { 00768 VALUE v = rb_str_new5(str, 0, 0); 00769 rb_enc_copy(v, str); 00770 OBJ_INFECT(v, str); 00771 return v; 00772 } 00773 00774 #define STR_BUF_MIN_SIZE 128 00775 00776 VALUE 00777 rb_str_buf_new(long capa) 00778 { 00779 VALUE str = str_alloc(rb_cString); 00780 00781 if (capa < STR_BUF_MIN_SIZE) { 00782 capa = STR_BUF_MIN_SIZE; 00783 } 00784 FL_SET(str, STR_NOEMBED); 00785 RSTRING(str)->as.heap.aux.capa = capa; 00786 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 00787 RSTRING(str)->as.heap.ptr[0] = '\0'; 00788 00789 return str; 00790 } 00791 00792 VALUE 00793 rb_str_buf_new_cstr(const char *ptr) 00794 { 00795 VALUE str; 00796 long len = strlen(ptr); 00797 00798 str = rb_str_buf_new(len); 00799 rb_str_buf_cat(str, ptr, len); 00800 00801 return str; 00802 } 00803 00804 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 00805 #define rb_str_buf_new2 rb_str_buf_new_cstr 00806 00807 VALUE 00808 rb_str_tmp_new(long len) 00809 { 00810 return str_new(0, 0, len); 00811 } 00812 00813 void * 00814 rb_alloc_tmp_buffer(volatile VALUE *store, long len) 00815 { 00816 VALUE s = rb_str_tmp_new(len); 00817 *store = s; 00818 return RSTRING_PTR(s); 00819 } 00820 00821 void 00822 rb_free_tmp_buffer(volatile VALUE *store) 00823 { 00824 VALUE s = *store; 00825 *store = 0; 00826 if (s) rb_str_clear(s); 00827 } 00828 00829 void 00830 rb_str_free(VALUE str) 00831 { 00832 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00833 xfree(RSTRING(str)->as.heap.ptr); 00834 } 00835 } 00836 00837 RUBY_FUNC_EXPORTED size_t 00838 rb_str_memsize(VALUE str) 00839 { 00840 if (!STR_EMBED_P(str) && 
!STR_SHARED_P(str)) { 00841 return RSTRING(str)->as.heap.aux.capa; 00842 } 00843 else { 00844 return 0; 00845 } 00846 } 00847 00848 VALUE 00849 rb_str_to_str(VALUE str) 00850 { 00851 return rb_convert_type(str, T_STRING, "String", "to_str"); 00852 } 00853 00854 static inline void str_discard(VALUE str); 00855 00856 void 00857 rb_str_shared_replace(VALUE str, VALUE str2) 00858 { 00859 rb_encoding *enc; 00860 int cr; 00861 if (str == str2) return; 00862 enc = STR_ENC_GET(str2); 00863 cr = ENC_CODERANGE(str2); 00864 str_discard(str); 00865 OBJ_INFECT(str, str2); 00866 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 00867 STR_SET_EMBED(str); 00868 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 00869 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 00870 rb_enc_associate(str, enc); 00871 ENC_CODERANGE_SET(str, cr); 00872 return; 00873 } 00874 STR_SET_NOEMBED(str); 00875 STR_UNSET_NOCAPA(str); 00876 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00877 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 00878 if (STR_NOCAPA_P(str2)) { 00879 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 00880 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 00881 } 00882 else { 00883 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 00884 } 00885 STR_SET_EMBED(str2); /* abandon str2 */ 00886 RSTRING_PTR(str2)[0] = 0; 00887 STR_SET_EMBED_LEN(str2, 0); 00888 rb_enc_associate(str, enc); 00889 ENC_CODERANGE_SET(str, cr); 00890 } 00891 00892 static ID id_to_s; 00893 00894 VALUE 00895 rb_obj_as_string(VALUE obj) 00896 { 00897 VALUE str; 00898 00899 if (RB_TYPE_P(obj, T_STRING)) { 00900 return obj; 00901 } 00902 str = rb_funcall(obj, id_to_s, 0); 00903 if (!RB_TYPE_P(str, T_STRING)) 00904 return rb_any_to_s(obj); 00905 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 00906 return str; 00907 } 00908 00909 static VALUE 00910 str_replace(VALUE str, VALUE str2) 00911 { 00912 long len; 00913 00914 len = RSTRING_LEN(str2); 00915 if (STR_ASSOC_P(str2)) { 00916 str2 = 
rb_str_new4(str2); 00917 } 00918 if (STR_SHARED_P(str2)) { 00919 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 00920 assert(OBJ_FROZEN(shared)); 00921 STR_SET_NOEMBED(str); 00922 RSTRING(str)->as.heap.len = len; 00923 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00924 FL_SET(str, ELTS_SHARED); 00925 FL_UNSET(str, STR_ASSOC); 00926 RSTRING(str)->as.heap.aux.shared = shared; 00927 } 00928 else { 00929 str_replace_shared(str, str2); 00930 } 00931 00932 OBJ_INFECT(str, str2); 00933 rb_enc_cr_str_exact_copy(str, str2); 00934 return str; 00935 } 00936 00937 static VALUE 00938 str_duplicate(VALUE klass, VALUE str) 00939 { 00940 VALUE dup = str_alloc(klass); 00941 str_replace(dup, str); 00942 return dup; 00943 } 00944 00945 VALUE 00946 rb_str_dup(VALUE str) 00947 { 00948 return str_duplicate(rb_obj_class(str), str); 00949 } 00950 00951 VALUE 00952 rb_str_resurrect(VALUE str) 00953 { 00954 if (RUBY_DTRACE_STRING_CREATE_ENABLED()) { 00955 RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str), 00956 rb_sourcefile(), rb_sourceline()); 00957 } 00958 return str_replace(str_alloc(rb_cString), str); 00959 } 00960 00961 /* 00962 * call-seq: 00963 * String.new(str="") -> new_str 00964 * 00965 * Returns a new string object containing a copy of <i>str</i>. 
00966 */ 00967 00968 static VALUE 00969 rb_str_init(int argc, VALUE *argv, VALUE str) 00970 { 00971 VALUE orig; 00972 00973 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 00974 rb_str_replace(str, orig); 00975 return str; 00976 } 00977 00978 static inline long 00979 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 00980 { 00981 long c; 00982 const char *q; 00983 00984 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00985 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00986 } 00987 else if (rb_enc_asciicompat(enc)) { 00988 c = 0; 00989 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 00990 while (p < e) { 00991 if (ISASCII(*p)) { 00992 q = search_nonascii(p, e); 00993 if (!q) 00994 return c + (e - p); 00995 c += q - p; 00996 p = q; 00997 } 00998 p += rb_enc_fast_mbclen(p, e, enc); 00999 c++; 01000 } 01001 } 01002 else { 01003 while (p < e) { 01004 if (ISASCII(*p)) { 01005 q = search_nonascii(p, e); 01006 if (!q) 01007 return c + (e - p); 01008 c += q - p; 01009 p = q; 01010 } 01011 p += rb_enc_mbclen(p, e, enc); 01012 c++; 01013 } 01014 } 01015 return c; 01016 } 01017 01018 for (c=0; p<e; c++) { 01019 p += rb_enc_mbclen(p, e, enc); 01020 } 01021 return c; 01022 } 01023 01024 long 01025 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 01026 { 01027 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 01028 } 01029 01030 long 01031 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 01032 { 01033 long c; 01034 const char *q; 01035 int ret; 01036 01037 *cr = 0; 01038 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01039 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 01040 } 01041 else if (rb_enc_asciicompat(enc)) { 01042 c = 0; 01043 while (p < e) { 01044 if (ISASCII(*p)) { 01045 q = search_nonascii(p, e); 01046 if (!q) { 01047 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01048 return c + (e - p); 01049 } 01050 c += q - p; 01051 p = q; 01052 } 01053 ret = 
rb_enc_precise_mbclen(p, e, enc); 01054 if (MBCLEN_CHARFOUND_P(ret)) { 01055 *cr |= ENC_CODERANGE_VALID; 01056 p += MBCLEN_CHARFOUND_LEN(ret); 01057 } 01058 else { 01059 *cr = ENC_CODERANGE_BROKEN; 01060 p++; 01061 } 01062 c++; 01063 } 01064 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01065 return c; 01066 } 01067 01068 for (c=0; p<e; c++) { 01069 ret = rb_enc_precise_mbclen(p, e, enc); 01070 if (MBCLEN_CHARFOUND_P(ret)) { 01071 *cr |= ENC_CODERANGE_VALID; 01072 p += MBCLEN_CHARFOUND_LEN(ret); 01073 } 01074 else { 01075 *cr = ENC_CODERANGE_BROKEN; 01076 if (p + rb_enc_mbminlen(enc) <= e) 01077 p += rb_enc_mbminlen(enc); 01078 else 01079 p = e; 01080 } 01081 } 01082 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01083 return c; 01084 } 01085 01086 #ifdef NONASCII_MASK 01087 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 01088 01089 /* 01090 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 01091 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 01092 * Therefore, following pseudo code can detect UTF-8 leading byte. 01093 * 01094 * if (!(byte & 0x80)) 01095 * byte |= 0x40; // turn on bit6 01096 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 01097 * 01098 * This function calculate every bytes in the argument word `s' 01099 * using the above logic concurrently. and gather every bytes result. 01100 */ 01101 static inline VALUE 01102 count_utf8_lead_bytes_with_word(const VALUE *s) 01103 { 01104 VALUE d = *s; 01105 01106 /* Transform into bit0 represent UTF-8 leading or not. */ 01107 d |= ~(d>>1); 01108 d >>= 6; 01109 d &= NONASCII_MASK >> 7; 01110 01111 /* Gather every bytes. 
*/ 01112 d += (d>>8); 01113 d += (d>>16); 01114 #if SIZEOF_VALUE == 8 01115 d += (d>>32); 01116 #endif 01117 return (d&0xF); 01118 } 01119 #endif 01120 01121 static long 01122 str_strlen(VALUE str, rb_encoding *enc) 01123 { 01124 const char *p, *e; 01125 long n; 01126 int cr; 01127 01128 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 01129 if (!enc) enc = STR_ENC_GET(str); 01130 p = RSTRING_PTR(str); 01131 e = RSTRING_END(str); 01132 cr = ENC_CODERANGE(str); 01133 #ifdef NONASCII_MASK 01134 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01135 enc == rb_utf8_encoding()) { 01136 01137 VALUE len = 0; 01138 if ((int)sizeof(VALUE) * 2 < e - p) { 01139 const VALUE *s, *t; 01140 const VALUE lowbits = sizeof(VALUE) - 1; 01141 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01142 t = (const VALUE*)(~lowbits & (VALUE)e); 01143 while (p < (const char *)s) { 01144 if (is_utf8_lead_byte(*p)) len++; 01145 p++; 01146 } 01147 while (s < t) { 01148 len += count_utf8_lead_bytes_with_word(s); 01149 s++; 01150 } 01151 p = (const char *)s; 01152 } 01153 while (p < e) { 01154 if (is_utf8_lead_byte(*p)) len++; 01155 p++; 01156 } 01157 return (long)len; 01158 } 01159 #endif 01160 n = rb_enc_strlen_cr(p, e, enc, &cr); 01161 if (cr) { 01162 ENC_CODERANGE_SET(str, cr); 01163 } 01164 return n; 01165 } 01166 01167 long 01168 rb_str_strlen(VALUE str) 01169 { 01170 return str_strlen(str, STR_ENC_GET(str)); 01171 } 01172 01173 /* 01174 * call-seq: 01175 * str.length -> integer 01176 * str.size -> integer 01177 * 01178 * Returns the character length of <i>str</i>. 01179 */ 01180 01181 VALUE 01182 rb_str_length(VALUE str) 01183 { 01184 long len; 01185 01186 len = str_strlen(str, STR_ENC_GET(str)); 01187 return LONG2NUM(len); 01188 } 01189 01190 /* 01191 * call-seq: 01192 * str.bytesize -> integer 01193 * 01194 * Returns the length of +str+ in bytes. 
01195 * 01196 * "\x80\u3042".bytesize #=> 4 01197 * "hello".bytesize #=> 5 01198 */ 01199 01200 static VALUE 01201 rb_str_bytesize(VALUE str) 01202 { 01203 return LONG2NUM(RSTRING_LEN(str)); 01204 } 01205 01206 /* 01207 * call-seq: 01208 * str.empty? -> true or false 01209 * 01210 * Returns <code>true</code> if <i>str</i> has a length of zero. 01211 * 01212 * "hello".empty? #=> false 01213 * " ".empty? #=> false 01214 * "".empty? #=> true 01215 */ 01216 01217 static VALUE 01218 rb_str_empty(VALUE str) 01219 { 01220 if (RSTRING_LEN(str) == 0) 01221 return Qtrue; 01222 return Qfalse; 01223 } 01224 01225 /* 01226 * call-seq: 01227 * str + other_str -> new_str 01228 * 01229 * Concatenation---Returns a new <code>String</code> containing 01230 * <i>other_str</i> concatenated to <i>str</i>. 01231 * 01232 * "Hello from " + self.to_s #=> "Hello from main" 01233 */ 01234 01235 VALUE 01236 rb_str_plus(VALUE str1, VALUE str2) 01237 { 01238 VALUE str3; 01239 rb_encoding *enc; 01240 01241 StringValue(str2); 01242 enc = rb_enc_check(str1, str2); 01243 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 01244 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 01245 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 01246 RSTRING_PTR(str2), RSTRING_LEN(str2)); 01247 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 01248 01249 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 01250 OBJ_TAINT(str3); 01251 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 01252 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 01253 return str3; 01254 } 01255 01256 /* 01257 * call-seq: 01258 * str * integer -> new_str 01259 * 01260 * Copy --- Returns a new String containing +integer+ copies of the receiver. 01261 * +integer+ must be greater than or equal to 0. 01262 * 01263 * "Ho! " * 3 #=> "Ho! Ho! Ho! " 01264 * "Ho! 
" * 0 #=> "" 01265 */ 01266 01267 VALUE 01268 rb_str_times(VALUE str, VALUE times) 01269 { 01270 VALUE str2; 01271 long n, len; 01272 char *ptr2; 01273 01274 len = NUM2LONG(times); 01275 if (len < 0) { 01276 rb_raise(rb_eArgError, "negative argument"); 01277 } 01278 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 01279 rb_raise(rb_eArgError, "argument too big"); 01280 } 01281 01282 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 01283 ptr2 = RSTRING_PTR(str2); 01284 if (len) { 01285 n = RSTRING_LEN(str); 01286 memcpy(ptr2, RSTRING_PTR(str), n); 01287 while (n <= len/2) { 01288 memcpy(ptr2 + n, ptr2, n); 01289 n *= 2; 01290 } 01291 memcpy(ptr2 + n, ptr2, len-n); 01292 } 01293 ptr2[RSTRING_LEN(str2)] = '\0'; 01294 OBJ_INFECT(str2, str); 01295 rb_enc_cr_str_copy_for_substr(str2, str); 01296 01297 return str2; 01298 } 01299 01300 /* 01301 * call-seq: 01302 * str % arg -> new_str 01303 * 01304 * Format---Uses <i>str</i> as a format specification, and returns the result 01305 * of applying it to <i>arg</i>. If the format specification contains more than 01306 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 01307 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 01308 * details of the format string. 
01309 * 01310 * "%05d" % 123 #=> "00123" 01311 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 01312 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 01313 */ 01314 01315 static VALUE 01316 rb_str_format_m(VALUE str, VALUE arg) 01317 { 01318 volatile VALUE tmp = rb_check_array_type(arg); 01319 01320 if (!NIL_P(tmp)) { 01321 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 01322 } 01323 return rb_str_format(1, &arg, str); 01324 } 01325 01326 static inline void 01327 str_modifiable(VALUE str) 01328 { 01329 if (FL_TEST(str, STR_TMPLOCK)) { 01330 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 01331 } 01332 rb_check_frozen(str); 01333 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 01334 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 01335 } 01336 01337 static inline int 01338 str_independent(VALUE str) 01339 { 01340 str_modifiable(str); 01341 if (!STR_SHARED_P(str)) return 1; 01342 if (STR_EMBED_P(str)) return 1; 01343 return 0; 01344 } 01345 01346 static void 01347 str_make_independent_expand(VALUE str, long expand) 01348 { 01349 char *ptr; 01350 long len = RSTRING_LEN(str); 01351 long capa = len + expand; 01352 01353 if (len > capa) len = capa; 01354 ptr = ALLOC_N(char, capa + 1); 01355 if (RSTRING_PTR(str)) { 01356 memcpy(ptr, RSTRING_PTR(str), len); 01357 } 01358 STR_SET_NOEMBED(str); 01359 STR_UNSET_NOCAPA(str); 01360 ptr[len] = 0; 01361 RSTRING(str)->as.heap.ptr = ptr; 01362 RSTRING(str)->as.heap.len = len; 01363 RSTRING(str)->as.heap.aux.capa = capa; 01364 } 01365 01366 #define str_make_independent(str) str_make_independent_expand((str), 0L) 01367 01368 void 01369 rb_str_modify(VALUE str) 01370 { 01371 if (!str_independent(str)) 01372 str_make_independent(str); 01373 ENC_CODERANGE_CLEAR(str); 01374 } 01375 01376 void 01377 rb_str_modify_expand(VALUE str, long expand) 01378 { 01379 if (expand < 0) { 01380 rb_raise(rb_eArgError, "negative expanding string size"); 01381 } 01382 if 
(!str_independent(str)) { 01383 str_make_independent_expand(str, expand); 01384 } 01385 else if (expand > 0) { 01386 long len = RSTRING_LEN(str); 01387 long capa = len + expand; 01388 if (!STR_EMBED_P(str)) { 01389 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 01390 STR_UNSET_NOCAPA(str); 01391 RSTRING(str)->as.heap.aux.capa = capa; 01392 } 01393 else if (capa > RSTRING_EMBED_LEN_MAX) { 01394 str_make_independent_expand(str, expand); 01395 } 01396 } 01397 ENC_CODERANGE_CLEAR(str); 01398 } 01399 01400 /* As rb_str_modify(), but don't clear coderange */ 01401 static void 01402 str_modify_keep_cr(VALUE str) 01403 { 01404 if (!str_independent(str)) 01405 str_make_independent(str); 01406 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 01407 /* Force re-scan later */ 01408 ENC_CODERANGE_CLEAR(str); 01409 } 01410 01411 static inline void 01412 str_discard(VALUE str) 01413 { 01414 str_modifiable(str); 01415 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 01416 xfree(RSTRING_PTR(str)); 01417 RSTRING(str)->as.heap.ptr = 0; 01418 RSTRING(str)->as.heap.len = 0; 01419 } 01420 } 01421 01422 void 01423 rb_str_associate(VALUE str, VALUE add) 01424 { 01425 /* sanity check */ 01426 rb_check_frozen(str); 01427 if (STR_ASSOC_P(str)) { 01428 /* already associated */ 01429 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 01430 } 01431 else { 01432 if (STR_SHARED_P(str)) { 01433 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 01434 str_make_independent(str); 01435 if (STR_ASSOC_P(assoc)) { 01436 assoc = RSTRING(assoc)->as.heap.aux.shared; 01437 rb_ary_concat(assoc, add); 01438 add = assoc; 01439 } 01440 } 01441 else if (STR_EMBED_P(str)) { 01442 str_make_independent(str); 01443 } 01444 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 01445 RESIZE_CAPA(str, RSTRING_LEN(str)); 01446 } 01447 FL_SET(str, STR_ASSOC); 01448 RBASIC(add)->klass = 0; 01449 RSTRING(str)->as.heap.aux.shared = add; 01450 } 01451 } 01452 01453 VALUE 01454 rb_str_associated(VALUE str) 01455 { 
01456 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared; 01457 if (STR_ASSOC_P(str)) { 01458 return RSTRING(str)->as.heap.aux.shared; 01459 } 01460 return Qfalse; 01461 } 01462 01463 void 01464 rb_must_asciicompat(VALUE str) 01465 { 01466 rb_encoding *enc = rb_enc_get(str); 01467 if (!rb_enc_asciicompat(enc)) { 01468 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 01469 } 01470 } 01471 01472 VALUE 01473 rb_string_value(volatile VALUE *ptr) 01474 { 01475 VALUE s = *ptr; 01476 if (!RB_TYPE_P(s, T_STRING)) { 01477 s = rb_str_to_str(s); 01478 *ptr = s; 01479 } 01480 return s; 01481 } 01482 01483 char * 01484 rb_string_value_ptr(volatile VALUE *ptr) 01485 { 01486 VALUE str = rb_string_value(ptr); 01487 return RSTRING_PTR(str); 01488 } 01489 01490 char * 01491 rb_string_value_cstr(volatile VALUE *ptr) 01492 { 01493 VALUE str = rb_string_value(ptr); 01494 char *s = RSTRING_PTR(str); 01495 long len = RSTRING_LEN(str); 01496 01497 if (!s || memchr(s, 0, len)) { 01498 rb_raise(rb_eArgError, "string contains null byte"); 01499 } 01500 if (s[len]) { 01501 rb_str_modify(str); 01502 s = RSTRING_PTR(str); 01503 s[RSTRING_LEN(str)] = 0; 01504 } 01505 return s; 01506 } 01507 01508 VALUE 01509 rb_check_string_type(VALUE str) 01510 { 01511 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 01512 return str; 01513 } 01514 01515 /* 01516 * call-seq: 01517 * String.try_convert(obj) -> string or nil 01518 * 01519 * Try to convert <i>obj</i> into a String, using to_str method. 01520 * Returns converted string or nil if <i>obj</i> cannot be converted 01521 * for any reason. 
01522 * 01523 * String.try_convert("str") #=> "str" 01524 * String.try_convert(/re/) #=> nil 01525 */ 01526 static VALUE 01527 rb_str_s_try_convert(VALUE dummy, VALUE str) 01528 { 01529 return rb_check_string_type(str); 01530 } 01531 01532 static char* 01533 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 01534 { 01535 long nth = *nthp; 01536 if (rb_enc_mbmaxlen(enc) == 1) { 01537 p += nth; 01538 } 01539 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01540 p += nth * rb_enc_mbmaxlen(enc); 01541 } 01542 else if (rb_enc_asciicompat(enc)) { 01543 const char *p2, *e2; 01544 int n; 01545 01546 while (p < e && 0 < nth) { 01547 e2 = p + nth; 01548 if (e < e2) { 01549 *nthp = nth; 01550 return (char *)e; 01551 } 01552 if (ISASCII(*p)) { 01553 p2 = search_nonascii(p, e2); 01554 if (!p2) { 01555 nth -= e2 - p; 01556 *nthp = nth; 01557 return (char *)e2; 01558 } 01559 nth -= p2 - p; 01560 p = p2; 01561 } 01562 n = rb_enc_mbclen(p, e, enc); 01563 p += n; 01564 nth--; 01565 } 01566 *nthp = nth; 01567 if (nth != 0) { 01568 return (char *)e; 01569 } 01570 return (char *)p; 01571 } 01572 else { 01573 while (p < e && nth--) { 01574 p += rb_enc_mbclen(p, e, enc); 01575 } 01576 } 01577 if (p > e) p = e; 01578 *nthp = nth; 01579 return (char*)p; 01580 } 01581 01582 char* 01583 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 01584 { 01585 return str_nth_len(p, e, &nth, enc); 01586 } 01587 01588 static char* 01589 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01590 { 01591 if (singlebyte) 01592 p += nth; 01593 else { 01594 p = str_nth_len(p, e, &nth, enc); 01595 } 01596 if (!p) return 0; 01597 if (p > e) p = e; 01598 return (char *)p; 01599 } 01600 01601 /* char offset to byte offset */ 01602 static long 01603 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01604 { 01605 const char *pp = str_nth(p, e, nth, enc, singlebyte); 01606 if (!pp) return e - p; 01607 
return pp - p; 01608 } 01609 01610 long 01611 rb_str_offset(VALUE str, long pos) 01612 { 01613 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 01614 STR_ENC_GET(str), single_byte_optimizable(str)); 01615 } 01616 01617 #ifdef NONASCII_MASK 01618 static char * 01619 str_utf8_nth(const char *p, const char *e, long *nthp) 01620 { 01621 long nth = *nthp; 01622 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 01623 const VALUE *s, *t; 01624 const VALUE lowbits = sizeof(VALUE) - 1; 01625 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01626 t = (const VALUE*)(~lowbits & (VALUE)e); 01627 while (p < (const char *)s) { 01628 if (is_utf8_lead_byte(*p)) nth--; 01629 p++; 01630 } 01631 do { 01632 nth -= count_utf8_lead_bytes_with_word(s); 01633 s++; 01634 } while (s < t && (int)sizeof(VALUE) <= nth); 01635 p = (char *)s; 01636 } 01637 while (p < e) { 01638 if (is_utf8_lead_byte(*p)) { 01639 if (nth == 0) break; 01640 nth--; 01641 } 01642 p++; 01643 } 01644 *nthp = nth; 01645 return (char *)p; 01646 } 01647 01648 static long 01649 str_utf8_offset(const char *p, const char *e, long nth) 01650 { 01651 const char *pp = str_utf8_nth(p, e, &nth); 01652 return pp - p; 01653 } 01654 #endif 01655 01656 /* byte offset to char offset */ 01657 long 01658 rb_str_sublen(VALUE str, long pos) 01659 { 01660 if (single_byte_optimizable(str) || pos < 0) 01661 return pos; 01662 else { 01663 char *p = RSTRING_PTR(str); 01664 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 01665 } 01666 } 01667 01668 VALUE 01669 rb_str_subseq(VALUE str, long beg, long len) 01670 { 01671 VALUE str2; 01672 01673 if (RSTRING_LEN(str) == beg + len && 01674 RSTRING_EMBED_LEN_MAX < len) { 01675 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 01676 rb_str_drop_bytes(str2, beg); 01677 } 01678 else { 01679 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 01680 RB_GC_GUARD(str); 01681 } 01682 01683 rb_enc_cr_str_copy_for_substr(str2, str); 01684 OBJ_INFECT(str2, 
str); 01685 01686 return str2; 01687 } 01688 01689 static char * 01690 rb_str_subpos(VALUE str, long beg, long *lenp) 01691 { 01692 long len = *lenp; 01693 long slen = -1L; 01694 long blen = RSTRING_LEN(str); 01695 rb_encoding *enc = STR_ENC_GET(str); 01696 char *p, *s = RSTRING_PTR(str), *e = s + blen; 01697 01698 if (len < 0) return 0; 01699 if (!blen) { 01700 len = 0; 01701 } 01702 if (single_byte_optimizable(str)) { 01703 if (beg > blen) return 0; 01704 if (beg < 0) { 01705 beg += blen; 01706 if (beg < 0) return 0; 01707 } 01708 if (beg + len > blen) 01709 len = blen - beg; 01710 if (len < 0) return 0; 01711 p = s + beg; 01712 goto end; 01713 } 01714 if (beg < 0) { 01715 if (len > -beg) len = -beg; 01716 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 01717 beg = -beg; 01718 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 01719 p = e; 01720 if (!p) return 0; 01721 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 01722 if (!p) return 0; 01723 len = e - p; 01724 goto end; 01725 } 01726 else { 01727 slen = str_strlen(str, enc); 01728 beg += slen; 01729 if (beg < 0) return 0; 01730 p = s + beg; 01731 if (len == 0) goto end; 01732 } 01733 } 01734 else if (beg > 0 && beg > RSTRING_LEN(str)) { 01735 return 0; 01736 } 01737 if (len == 0) { 01738 if (beg > str_strlen(str, enc)) return 0; 01739 p = s + beg; 01740 } 01741 #ifdef NONASCII_MASK 01742 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01743 enc == rb_utf8_encoding()) { 01744 p = str_utf8_nth(s, e, &beg); 01745 if (beg > 0) return 0; 01746 len = str_utf8_offset(p, e, len); 01747 } 01748 #endif 01749 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01750 int char_sz = rb_enc_mbmaxlen(enc); 01751 01752 p = s + beg * char_sz; 01753 if (p > e) { 01754 return 0; 01755 } 01756 else if (len * char_sz > e - p) 01757 len = e - p; 01758 else 01759 len *= char_sz; 01760 } 01761 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 01762 if (beg > 0) return 0; 01763 len 
= 0; 01764 } 01765 else { 01766 len = str_offset(p, e, len, enc, 0); 01767 } 01768 end: 01769 *lenp = len; 01770 RB_GC_GUARD(str); 01771 return p; 01772 } 01773 01774 VALUE 01775 rb_str_substr(VALUE str, long beg, long len) 01776 { 01777 VALUE str2; 01778 char *p = rb_str_subpos(str, beg, &len); 01779 01780 if (!p) return Qnil; 01781 if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) { 01782 str2 = rb_str_new4(str); 01783 str2 = str_new3(rb_obj_class(str2), str2); 01784 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 01785 RSTRING(str2)->as.heap.len = len; 01786 } 01787 else { 01788 str2 = rb_str_new5(str, p, len); 01789 rb_enc_cr_str_copy_for_substr(str2, str); 01790 OBJ_INFECT(str2, str); 01791 RB_GC_GUARD(str); 01792 } 01793 01794 return str2; 01795 } 01796 01797 VALUE 01798 rb_str_freeze(VALUE str) 01799 { 01800 if (STR_ASSOC_P(str)) { 01801 VALUE ary = RSTRING(str)->as.heap.aux.shared; 01802 OBJ_FREEZE(ary); 01803 } 01804 return rb_obj_freeze(str); 01805 } 01806 01807 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 01808 #define rb_str_dup_frozen rb_str_new_frozen 01809 01810 VALUE 01811 rb_str_locktmp(VALUE str) 01812 { 01813 if (FL_TEST(str, STR_TMPLOCK)) { 01814 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 01815 } 01816 FL_SET(str, STR_TMPLOCK); 01817 return str; 01818 } 01819 01820 VALUE 01821 rb_str_unlocktmp(VALUE str) 01822 { 01823 if (!FL_TEST(str, STR_TMPLOCK)) { 01824 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 01825 } 01826 FL_UNSET(str, STR_TMPLOCK); 01827 return str; 01828 } 01829 01830 VALUE 01831 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg) 01832 { 01833 rb_str_locktmp(str); 01834 return rb_ensure(func, arg, rb_str_unlocktmp, str); 01835 } 01836 01837 void 01838 rb_str_set_len(VALUE str, long len) 01839 { 01840 long capa; 01841 01842 str_modifiable(str); 01843 if (STR_SHARED_P(str)) { 01844 
rb_raise(rb_eRuntimeError, "can't set length of shared string"); 01845 } 01846 if (len > (capa = (long)rb_str_capacity(str))) { 01847 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 01848 } 01849 STR_SET_LEN(str, len); 01850 RSTRING_PTR(str)[len] = '\0'; 01851 } 01852 01853 VALUE 01854 rb_str_resize(VALUE str, long len) 01855 { 01856 long slen; 01857 int independent; 01858 01859 if (len < 0) { 01860 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01861 } 01862 01863 independent = str_independent(str); 01864 ENC_CODERANGE_CLEAR(str); 01865 slen = RSTRING_LEN(str); 01866 if (len != slen) { 01867 if (STR_EMBED_P(str)) { 01868 if (len <= RSTRING_EMBED_LEN_MAX) { 01869 STR_SET_EMBED_LEN(str, len); 01870 RSTRING(str)->as.ary[len] = '\0'; 01871 return str; 01872 } 01873 str_make_independent_expand(str, len - slen); 01874 STR_SET_NOEMBED(str); 01875 } 01876 else if (len <= RSTRING_EMBED_LEN_MAX) { 01877 char *ptr = RSTRING(str)->as.heap.ptr; 01878 STR_SET_EMBED(str); 01879 if (slen > len) slen = len; 01880 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 01881 RSTRING(str)->as.ary[len] = '\0'; 01882 STR_SET_EMBED_LEN(str, len); 01883 if (independent) xfree(ptr); 01884 return str; 01885 } 01886 else if (!independent) { 01887 str_make_independent_expand(str, len - slen); 01888 } 01889 else if (slen < len || slen - len > 1024) { 01890 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 01891 } 01892 if (!STR_NOCAPA_P(str)) { 01893 RSTRING(str)->as.heap.aux.capa = len; 01894 } 01895 RSTRING(str)->as.heap.len = len; 01896 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 01897 } 01898 return str; 01899 } 01900 01901 static VALUE 01902 str_buf_cat(VALUE str, const char *ptr, long len) 01903 { 01904 long capa, total, off = -1; 01905 01906 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 01907 off = ptr - RSTRING_PTR(str); 01908 } 01909 rb_str_modify(str); 01910 if (len == 0) return 0; 01911 if (STR_ASSOC_P(str)) { 01912 
FL_UNSET(str, STR_ASSOC); 01913 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 01914 } 01915 else if (STR_EMBED_P(str)) { 01916 capa = RSTRING_EMBED_LEN_MAX; 01917 } 01918 else { 01919 capa = RSTRING(str)->as.heap.aux.capa; 01920 } 01921 if (RSTRING_LEN(str) >= LONG_MAX - len) { 01922 rb_raise(rb_eArgError, "string sizes too big"); 01923 } 01924 total = RSTRING_LEN(str)+len; 01925 if (capa <= total) { 01926 while (total > capa) { 01927 if (capa + 1 >= LONG_MAX / 2) { 01928 capa = (total + 4095) / 4096; 01929 break; 01930 } 01931 capa = (capa + 1) * 2; 01932 } 01933 RESIZE_CAPA(str, capa); 01934 } 01935 if (off != -1) { 01936 ptr = RSTRING_PTR(str) + off; 01937 } 01938 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 01939 STR_SET_LEN(str, total); 01940 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 01941 01942 return str; 01943 } 01944 01945 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 01946 01947 VALUE 01948 rb_str_buf_cat(VALUE str, const char *ptr, long len) 01949 { 01950 if (len == 0) return str; 01951 if (len < 0) { 01952 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01953 } 01954 return str_buf_cat(str, ptr, len); 01955 } 01956 01957 VALUE 01958 rb_str_buf_cat2(VALUE str, const char *ptr) 01959 { 01960 return rb_str_buf_cat(str, ptr, strlen(ptr)); 01961 } 01962 01963 VALUE 01964 rb_str_cat(VALUE str, const char *ptr, long len) 01965 { 01966 if (len < 0) { 01967 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01968 } 01969 if (STR_ASSOC_P(str)) { 01970 char *p; 01971 rb_str_modify_expand(str, len); 01972 p = RSTRING(str)->as.heap.ptr; 01973 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 01974 len = RSTRING(str)->as.heap.len += len; 01975 p[len] = '\0'; /* sentinel */ 01976 return str; 01977 } 01978 01979 return rb_str_buf_cat(str, ptr, len); 01980 } 01981 01982 VALUE 01983 rb_str_cat2(VALUE str, const char *ptr) 01984 { 01985 return rb_str_cat(str, ptr, strlen(ptr)); 01986 } 
01987 01988 static VALUE 01989 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 01990 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 01991 { 01992 int str_encindex = ENCODING_GET(str); 01993 int res_encindex; 01994 int str_cr, res_cr; 01995 01996 str_cr = ENC_CODERANGE(str); 01997 01998 if (str_encindex == ptr_encindex) { 01999 if (str_cr == ENC_CODERANGE_UNKNOWN) 02000 ptr_cr = ENC_CODERANGE_UNKNOWN; 02001 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 02002 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex)); 02003 } 02004 } 02005 else { 02006 rb_encoding *str_enc = rb_enc_from_index(str_encindex); 02007 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 02008 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 02009 if (len == 0) 02010 return str; 02011 if (RSTRING_LEN(str) == 0) { 02012 rb_str_buf_cat(str, ptr, len); 02013 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 02014 return str; 02015 } 02016 goto incompatible; 02017 } 02018 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 02019 ptr_cr = coderange_scan(ptr, len, ptr_enc); 02020 } 02021 if (str_cr == ENC_CODERANGE_UNKNOWN) { 02022 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 02023 str_cr = rb_enc_str_coderange(str); 02024 } 02025 } 02026 } 02027 if (ptr_cr_ret) 02028 *ptr_cr_ret = ptr_cr; 02029 02030 if (str_encindex != ptr_encindex && 02031 str_cr != ENC_CODERANGE_7BIT && 02032 ptr_cr != ENC_CODERANGE_7BIT) { 02033 incompatible: 02034 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 02035 rb_enc_name(rb_enc_from_index(str_encindex)), 02036 rb_enc_name(rb_enc_from_index(ptr_encindex))); 02037 } 02038 02039 if (str_cr == ENC_CODERANGE_UNKNOWN) { 02040 res_encindex = str_encindex; 02041 res_cr = ENC_CODERANGE_UNKNOWN; 02042 } 02043 else if (str_cr == ENC_CODERANGE_7BIT) { 02044 if (ptr_cr == ENC_CODERANGE_7BIT) { 02045 res_encindex = str_encindex; 02046 res_cr = ENC_CODERANGE_7BIT; 02047 } 02048 else { 02049 res_encindex 
= ptr_encindex; 02050 res_cr = ptr_cr; 02051 } 02052 } 02053 else if (str_cr == ENC_CODERANGE_VALID) { 02054 res_encindex = str_encindex; 02055 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) 02056 res_cr = str_cr; 02057 else 02058 res_cr = ptr_cr; 02059 } 02060 else { /* str_cr == ENC_CODERANGE_BROKEN */ 02061 res_encindex = str_encindex; 02062 res_cr = str_cr; 02063 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 02064 } 02065 02066 if (len < 0) { 02067 rb_raise(rb_eArgError, "negative string size (or size too big)"); 02068 } 02069 str_buf_cat(str, ptr, len); 02070 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 02071 return str; 02072 } 02073 02074 VALUE 02075 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 02076 { 02077 return rb_enc_cr_str_buf_cat(str, ptr, len, 02078 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 02079 } 02080 02081 VALUE 02082 rb_str_buf_cat_ascii(VALUE str, const char *ptr) 02083 { 02084 /* ptr must reference NUL terminated ASCII string. 
*/ 02085 int encindex = ENCODING_GET(str); 02086 rb_encoding *enc = rb_enc_from_index(encindex); 02087 if (rb_enc_asciicompat(enc)) { 02088 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 02089 encindex, ENC_CODERANGE_7BIT, 0); 02090 } 02091 else { 02092 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 02093 while (*ptr) { 02094 unsigned int c = (unsigned char)*ptr; 02095 int len = rb_enc_codelen(c, enc); 02096 rb_enc_mbcput(c, buf, enc); 02097 rb_enc_cr_str_buf_cat(str, buf, len, 02098 encindex, ENC_CODERANGE_VALID, 0); 02099 ptr++; 02100 } 02101 return str; 02102 } 02103 } 02104 02105 VALUE 02106 rb_str_buf_append(VALUE str, VALUE str2) 02107 { 02108 int str2_cr; 02109 02110 str2_cr = ENC_CODERANGE(str2); 02111 02112 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 02113 ENCODING_GET(str2), str2_cr, &str2_cr); 02114 02115 OBJ_INFECT(str, str2); 02116 ENC_CODERANGE_SET(str2, str2_cr); 02117 02118 return str; 02119 } 02120 02121 VALUE 02122 rb_str_append(VALUE str, VALUE str2) 02123 { 02124 rb_encoding *enc; 02125 int cr, cr2; 02126 long len2; 02127 02128 StringValue(str2); 02129 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { 02130 long len = RSTRING_LEN(str) + len2; 02131 enc = rb_enc_check(str, str2); 02132 cr = ENC_CODERANGE(str); 02133 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; 02134 rb_str_modify_expand(str, len2); 02135 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 02136 RSTRING_PTR(str2), len2+1); 02137 RSTRING(str)->as.heap.len = len; 02138 rb_enc_associate(str, enc); 02139 ENC_CODERANGE_SET(str, cr); 02140 OBJ_INFECT(str, str2); 02141 return str; 02142 } 02143 return rb_str_buf_append(str, str2); 02144 } 02145 02146 /* 02147 * call-seq: 02148 * str << integer -> str 02149 * str.concat(integer) -> str 02150 * str << obj -> str 02151 * str.concat(obj) -> str 02152 * 02153 * Append---Concatenates the given object to <i>str</i>. 
If the object is a 02154 * <code>Integer</code>, it is considered as a codepoint, and is converted 02155 * to a character before concatenation. 02156 * 02157 * a = "hello " 02158 * a << "world" #=> "hello world" 02159 * a.concat(33) #=> "hello world!" 02160 */ 02161 02162 VALUE 02163 rb_str_concat(VALUE str1, VALUE str2) 02164 { 02165 unsigned int code; 02166 rb_encoding *enc = STR_ENC_GET(str1); 02167 02168 if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) { 02169 if (rb_num_to_uint(str2, &code) == 0) { 02170 } 02171 else if (FIXNUM_P(str2)) { 02172 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 02173 } 02174 else { 02175 rb_raise(rb_eRangeError, "bignum out of char range"); 02176 } 02177 } 02178 else { 02179 return rb_str_append(str1, str2); 02180 } 02181 02182 if (enc == rb_usascii_encoding()) { 02183 /* US-ASCII automatically extended to ASCII-8BIT */ 02184 char buf[1]; 02185 buf[0] = (char)code; 02186 if (code > 0xFF) { 02187 rb_raise(rb_eRangeError, "%u out of char range", code); 02188 } 02189 rb_str_cat(str1, buf, 1); 02190 if (code > 127) { 02191 rb_enc_associate(str1, rb_ascii8bit_encoding()); 02192 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 02193 } 02194 } 02195 else { 02196 long pos = RSTRING_LEN(str1); 02197 int cr = ENC_CODERANGE(str1); 02198 int len; 02199 char *buf; 02200 02201 switch (len = rb_enc_codelen(code, enc)) { 02202 case ONIGERR_INVALID_CODE_POINT_VALUE: 02203 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02204 break; 02205 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 02206 case 0: 02207 rb_raise(rb_eRangeError, "%u out of char range", code); 02208 break; 02209 } 02210 buf = ALLOCA_N(char, len + 1); 02211 rb_enc_mbcput(code, buf, enc); 02212 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 02213 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02214 } 02215 rb_str_resize(str1, pos+len); 02216 memcpy(RSTRING_PTR(str1) + pos, buf, len); 02217 if 
(cr == ENC_CODERANGE_7BIT && code > 127) 02218 cr = ENC_CODERANGE_VALID; 02219 ENC_CODERANGE_SET(str1, cr); 02220 } 02221 return str1; 02222 } 02223 02224 /* 02225 * call-seq: 02226 * str.prepend(other_str) -> str 02227 * 02228 * Prepend---Prepend the given string to <i>str</i>. 02229 * 02230 * a = "world" 02231 * a.prepend("hello ") #=> "hello world" 02232 * a #=> "hello world" 02233 */ 02234 02235 static VALUE 02236 rb_str_prepend(VALUE str, VALUE str2) 02237 { 02238 StringValue(str2); 02239 StringValue(str); 02240 rb_str_update(str, 0L, 0L, str2); 02241 return str; 02242 } 02243 02244 st_index_t 02245 rb_str_hash(VALUE str) 02246 { 02247 int e = ENCODING_GET(str); 02248 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02249 e = 0; 02250 } 02251 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 02252 } 02253 02254 int 02255 rb_str_hash_cmp(VALUE str1, VALUE str2) 02256 { 02257 long len; 02258 02259 if (!rb_str_comparable(str1, str2)) return 1; 02260 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 02261 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 02262 return 0; 02263 } 02264 return 1; 02265 } 02266 02267 /* 02268 * call-seq: 02269 * str.hash -> fixnum 02270 * 02271 * Return a hash based on the string's length and content. 
02272 */ 02273 02274 static VALUE 02275 rb_str_hash_m(VALUE str) 02276 { 02277 st_index_t hval = rb_str_hash(str); 02278 return INT2FIX(hval); 02279 } 02280 02281 #define lesser(a,b) (((a)>(b))?(b):(a)) 02282 02283 int 02284 rb_str_comparable(VALUE str1, VALUE str2) 02285 { 02286 int idx1, idx2; 02287 int rc1, rc2; 02288 02289 if (RSTRING_LEN(str1) == 0) return TRUE; 02290 if (RSTRING_LEN(str2) == 0) return TRUE; 02291 idx1 = ENCODING_GET(str1); 02292 idx2 = ENCODING_GET(str2); 02293 if (idx1 == idx2) return TRUE; 02294 rc1 = rb_enc_str_coderange(str1); 02295 rc2 = rb_enc_str_coderange(str2); 02296 if (rc1 == ENC_CODERANGE_7BIT) { 02297 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 02298 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 02299 return TRUE; 02300 } 02301 if (rc2 == ENC_CODERANGE_7BIT) { 02302 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 02303 return TRUE; 02304 } 02305 return FALSE; 02306 } 02307 02308 int 02309 rb_str_cmp(VALUE str1, VALUE str2) 02310 { 02311 long len1, len2; 02312 const char *ptr1, *ptr2; 02313 int retval; 02314 02315 if (str1 == str2) return 0; 02316 RSTRING_GETMEM(str1, ptr1, len1); 02317 RSTRING_GETMEM(str2, ptr2, len2); 02318 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 02319 if (len1 == len2) { 02320 if (!rb_str_comparable(str1, str2)) { 02321 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 02322 return 1; 02323 return -1; 02324 } 02325 return 0; 02326 } 02327 if (len1 > len2) return 1; 02328 return -1; 02329 } 02330 if (retval > 0) return 1; 02331 return -1; 02332 } 02333 02334 /* expect tail call optimization */ 02335 static VALUE 02336 str_eql(const VALUE str1, const VALUE str2) 02337 { 02338 const long len = RSTRING_LEN(str1); 02339 const char *ptr1, *ptr2; 02340 02341 if (len != RSTRING_LEN(str2)) return Qfalse; 02342 if (!rb_str_comparable(str1, str2)) return Qfalse; 02343 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 02344 return Qtrue; 02345 if (memcmp(ptr1, ptr2, len) == 
0) 02346 return Qtrue; 02347 return Qfalse; 02348 } 02349 02350 /* 02351 * call-seq: 02352 * str == obj -> true or false 02353 * 02354 * Equality---If <i>obj</i> is not a <code>String</code>, returns 02355 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 02356 * <code><=></code> <i>obj</i> returns zero. 02357 */ 02358 02359 VALUE 02360 rb_str_equal(VALUE str1, VALUE str2) 02361 { 02362 if (str1 == str2) return Qtrue; 02363 if (!RB_TYPE_P(str2, T_STRING)) { 02364 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02365 return Qfalse; 02366 } 02367 return rb_equal(str2, str1); 02368 } 02369 return str_eql(str1, str2); 02370 } 02371 02372 /* 02373 * call-seq: 02374 * str.eql?(other) -> true or false 02375 * 02376 * Two strings are equal if they have the same length and content. 02377 */ 02378 02379 static VALUE 02380 rb_str_eql(VALUE str1, VALUE str2) 02381 { 02382 if (str1 == str2) return Qtrue; 02383 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse; 02384 return str_eql(str1, str2); 02385 } 02386 02387 /* 02388 * call-seq: 02389 * string <=> other_string -> -1, 0, +1 or nil 02390 * 02391 * 02392 * Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less 02393 * than, equal to, or greater than +other_string+. 02394 * 02395 * +nil+ is returned if the two values are incomparable. 02396 * 02397 * If the strings are of different lengths, and the strings are equal when 02398 * compared up to the shortest length, then the longer string is considered 02399 * greater than the shorter one. 02400 * 02401 * <code><=></code> is the basis for the methods <code><</code>, 02402 * <code><=</code>, <code>></code>, <code>>=</code>, and 02403 * <code>between?</code>, included from module Comparable. The method 02404 * String#== does not use Comparable#==. 
02405 * 02406 * "abcdef" <=> "abcde" #=> 1 02407 * "abcdef" <=> "abcdef" #=> 0 02408 * "abcdef" <=> "abcdefg" #=> -1 02409 * "abcdef" <=> "ABCDEF" #=> 1 02410 */ 02411 02412 static VALUE 02413 rb_str_cmp_m(VALUE str1, VALUE str2) 02414 { 02415 int result; 02416 02417 if (!RB_TYPE_P(str2, T_STRING)) { 02418 VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0); 02419 if (RB_TYPE_P(tmp, T_STRING)) { 02420 result = rb_str_cmp(str1, tmp); 02421 } 02422 else { 02423 return rb_invcmp(str1, str2); 02424 } 02425 } 02426 else { 02427 result = rb_str_cmp(str1, str2); 02428 } 02429 return INT2FIX(result); 02430 } 02431 02432 /* 02433 * call-seq: 02434 * str.casecmp(other_str) -> -1, 0, +1 or nil 02435 * 02436 * Case-insensitive version of <code>String#<=></code>. 02437 * 02438 * "abcdef".casecmp("abcde") #=> 1 02439 * "aBcDeF".casecmp("abcdef") #=> 0 02440 * "abcdef".casecmp("abcdefg") #=> -1 02441 * "abcdef".casecmp("ABCDEF") #=> 0 02442 */ 02443 02444 static VALUE 02445 rb_str_casecmp(VALUE str1, VALUE str2) 02446 { 02447 long len; 02448 rb_encoding *enc; 02449 char *p1, *p1end, *p2, *p2end; 02450 02451 StringValue(str2); 02452 enc = rb_enc_compatible(str1, str2); 02453 if (!enc) { 02454 return Qnil; 02455 } 02456 02457 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 02458 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 02459 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 02460 while (p1 < p1end && p2 < p2end) { 02461 if (*p1 != *p2) { 02462 unsigned int c1 = TOUPPER(*p1 & 0xff); 02463 unsigned int c2 = TOUPPER(*p2 & 0xff); 02464 if (c1 != c2) 02465 return INT2FIX(c1 < c2 ? -1 : 1); 02466 } 02467 p1++; 02468 p2++; 02469 } 02470 } 02471 else { 02472 while (p1 < p1end && p2 < p2end) { 02473 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 02474 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 02475 02476 if (0 <= c1 && 0 <= c2) { 02477 c1 = TOUPPER(c1); 02478 c2 = TOUPPER(c2); 02479 if (c1 != c2) 02480 return INT2FIX(c1 < c2 ? 
-1 : 1); 02481 } 02482 else { 02483 int r; 02484 l1 = rb_enc_mbclen(p1, p1end, enc); 02485 l2 = rb_enc_mbclen(p2, p2end, enc); 02486 len = l1 < l2 ? l1 : l2; 02487 r = memcmp(p1, p2, len); 02488 if (r != 0) 02489 return INT2FIX(r < 0 ? -1 : 1); 02490 if (l1 != l2) 02491 return INT2FIX(l1 < l2 ? -1 : 1); 02492 } 02493 p1 += l1; 02494 p2 += l2; 02495 } 02496 } 02497 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 02498 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 02499 return INT2FIX(-1); 02500 } 02501 02502 static long 02503 rb_str_index(VALUE str, VALUE sub, long offset) 02504 { 02505 long pos; 02506 char *s, *sptr, *e; 02507 long len, slen; 02508 rb_encoding *enc; 02509 02510 enc = rb_enc_check(str, sub); 02511 if (is_broken_string(sub)) { 02512 return -1; 02513 } 02514 len = str_strlen(str, enc); 02515 slen = str_strlen(sub, enc); 02516 if (offset < 0) { 02517 offset += len; 02518 if (offset < 0) return -1; 02519 } 02520 if (len - offset < slen) return -1; 02521 s = RSTRING_PTR(str); 02522 e = s + RSTRING_LEN(str); 02523 if (offset) { 02524 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 02525 s += offset; 02526 } 02527 if (slen == 0) return offset; 02528 /* need proceed one character at a time */ 02529 sptr = RSTRING_PTR(sub); 02530 slen = RSTRING_LEN(sub); 02531 len = RSTRING_LEN(str) - offset; 02532 for (;;) { 02533 char *t; 02534 pos = rb_memsearch(sptr, slen, s, len, enc); 02535 if (pos < 0) return pos; 02536 t = rb_enc_right_char_head(s, s+pos, e, enc); 02537 if (t == s + pos) break; 02538 if ((len -= t - s) <= 0) return -1; 02539 offset += t - s; 02540 s = t; 02541 } 02542 return pos + offset; 02543 } 02544 02545 02546 /* 02547 * call-seq: 02548 * str.index(substring [, offset]) -> fixnum or nil 02549 * str.index(regexp [, offset]) -> fixnum or nil 02550 * 02551 * Returns the index of the first occurrence of the given <i>substring</i> or 02552 * pattern (<i>regexp</i>) in <i>str</i>. 
Returns <code>nil</code> if not 02553 * found. If the second parameter is present, it specifies the position in the 02554 * string to begin the search. 02555 * 02556 * "hello".index('e') #=> 1 02557 * "hello".index('lo') #=> 3 02558 * "hello".index('a') #=> nil 02559 * "hello".index(?e) #=> 1 02560 * "hello".index(/[aeiou]/, -3) #=> 4 02561 */ 02562 02563 static VALUE 02564 rb_str_index_m(int argc, VALUE *argv, VALUE str) 02565 { 02566 VALUE sub; 02567 VALUE initpos; 02568 long pos; 02569 02570 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 02571 pos = NUM2LONG(initpos); 02572 } 02573 else { 02574 pos = 0; 02575 } 02576 if (pos < 0) { 02577 pos += str_strlen(str, STR_ENC_GET(str)); 02578 if (pos < 0) { 02579 if (RB_TYPE_P(sub, T_REGEXP)) { 02580 rb_backref_set(Qnil); 02581 } 02582 return Qnil; 02583 } 02584 } 02585 02586 if (SPECIAL_CONST_P(sub)) goto generic; 02587 switch (BUILTIN_TYPE(sub)) { 02588 case T_REGEXP: 02589 if (pos > str_strlen(str, STR_ENC_GET(str))) 02590 return Qnil; 02591 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02592 rb_enc_check(str, sub), single_byte_optimizable(str)); 02593 02594 pos = rb_reg_search(sub, str, pos, 0); 02595 pos = rb_str_sublen(str, pos); 02596 break; 02597 02598 generic: 02599 default: { 02600 VALUE tmp; 02601 02602 tmp = rb_check_string_type(sub); 02603 if (NIL_P(tmp)) { 02604 rb_raise(rb_eTypeError, "type mismatch: %s given", 02605 rb_obj_classname(sub)); 02606 } 02607 sub = tmp; 02608 } 02609 /* fall through */ 02610 case T_STRING: 02611 pos = rb_str_index(str, sub, pos); 02612 pos = rb_str_sublen(str, pos); 02613 break; 02614 } 02615 02616 if (pos == -1) return Qnil; 02617 return LONG2NUM(pos); 02618 } 02619 02620 static long 02621 rb_str_rindex(VALUE str, VALUE sub, long pos) 02622 { 02623 long len, slen; 02624 char *s, *sbeg, *e, *t; 02625 rb_encoding *enc; 02626 int singlebyte = single_byte_optimizable(str); 02627 02628 enc = rb_enc_check(str, sub); 02629 if (is_broken_string(sub)) { 02630 
return -1; 02631 } 02632 len = str_strlen(str, enc); 02633 slen = str_strlen(sub, enc); 02634 /* substring longer than string */ 02635 if (len < slen) return -1; 02636 if (len - pos < slen) { 02637 pos = len - slen; 02638 } 02639 if (len == 0) { 02640 return pos; 02641 } 02642 sbeg = RSTRING_PTR(str); 02643 e = RSTRING_END(str); 02644 t = RSTRING_PTR(sub); 02645 slen = RSTRING_LEN(sub); 02646 s = str_nth(sbeg, e, pos, enc, singlebyte); 02647 while (s) { 02648 if (memcmp(s, t, slen) == 0) { 02649 return pos; 02650 } 02651 if (pos == 0) break; 02652 pos--; 02653 s = rb_enc_prev_char(sbeg, s, e, enc); 02654 } 02655 return -1; 02656 } 02657 02658 02659 /* 02660 * call-seq: 02661 * str.rindex(substring [, fixnum]) -> fixnum or nil 02662 * str.rindex(regexp [, fixnum]) -> fixnum or nil 02663 * 02664 * Returns the index of the last occurrence of the given <i>substring</i> or 02665 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02666 * found. If the second parameter is present, it specifies the position in the 02667 * string to end the search---characters beyond this point will not be 02668 * considered. 
02669 * 02670 * "hello".rindex('e') #=> 1 02671 * "hello".rindex('l') #=> 3 02672 * "hello".rindex('a') #=> nil 02673 * "hello".rindex(?e) #=> 1 02674 * "hello".rindex(/[aeiou]/, -2) #=> 1 02675 */ 02676 02677 static VALUE 02678 rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 02679 { 02680 VALUE sub; 02681 VALUE vpos; 02682 rb_encoding *enc = STR_ENC_GET(str); 02683 long pos, len = str_strlen(str, enc); 02684 02685 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 02686 pos = NUM2LONG(vpos); 02687 if (pos < 0) { 02688 pos += len; 02689 if (pos < 0) { 02690 if (RB_TYPE_P(sub, T_REGEXP)) { 02691 rb_backref_set(Qnil); 02692 } 02693 return Qnil; 02694 } 02695 } 02696 if (pos > len) pos = len; 02697 } 02698 else { 02699 pos = len; 02700 } 02701 02702 if (SPECIAL_CONST_P(sub)) goto generic; 02703 switch (BUILTIN_TYPE(sub)) { 02704 case T_REGEXP: 02705 /* enc = rb_get_check(str, sub); */ 02706 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02707 STR_ENC_GET(str), single_byte_optimizable(str)); 02708 02709 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { 02710 pos = rb_reg_search(sub, str, pos, 1); 02711 pos = rb_str_sublen(str, pos); 02712 } 02713 if (pos >= 0) return LONG2NUM(pos); 02714 break; 02715 02716 generic: 02717 default: { 02718 VALUE tmp; 02719 02720 tmp = rb_check_string_type(sub); 02721 if (NIL_P(tmp)) { 02722 rb_raise(rb_eTypeError, "type mismatch: %s given", 02723 rb_obj_classname(sub)); 02724 } 02725 sub = tmp; 02726 } 02727 /* fall through */ 02728 case T_STRING: 02729 pos = rb_str_rindex(str, sub, pos); 02730 if (pos >= 0) return LONG2NUM(pos); 02731 break; 02732 } 02733 return Qnil; 02734 } 02735 02736 /* 02737 * call-seq: 02738 * str =~ obj -> fixnum or nil 02739 * 02740 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 02741 * against <i>str</i>,and returns the position the match starts, or 02742 * <code>nil</code> if there is no match. 
Otherwise, invokes 02743 * <i>obj.=~</i>, passing <i>str</i> as an argument. The default 02744 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 02745 * 02746 * Note: <code>str =~ regexp</code> is not the same as 02747 * <code>regexp =~ str</code>. Strings captured from named capture groups 02748 * are assigned to local variables only in the second case. 02749 * 02750 * "cat o' 9 tails" =~ /\d/ #=> 7 02751 * "cat o' 9 tails" =~ 9 #=> nil 02752 */ 02753 02754 static VALUE 02755 rb_str_match(VALUE x, VALUE y) 02756 { 02757 if (SPECIAL_CONST_P(y)) goto generic; 02758 switch (BUILTIN_TYPE(y)) { 02759 case T_STRING: 02760 rb_raise(rb_eTypeError, "type mismatch: String given"); 02761 02762 case T_REGEXP: 02763 return rb_reg_match(y, x); 02764 02765 generic: 02766 default: 02767 return rb_funcall(y, rb_intern("=~"), 1, x); 02768 } 02769 } 02770 02771 02772 static VALUE get_pat(VALUE, int); 02773 02774 02775 /* 02776 * call-seq: 02777 * str.match(pattern) -> matchdata or nil 02778 * str.match(pattern, pos) -> matchdata or nil 02779 * 02780 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 02781 * then invokes its <code>match</code> method on <i>str</i>. If the second 02782 * parameter is present, it specifies the position in the string to begin the 02783 * search. 02784 * 02785 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 02786 * 'hello'.match('(.)\1')[0] #=> "ll" 02787 * 'hello'.match(/(.)\1/)[0] #=> "ll" 02788 * 'hello'.match('xx') #=> nil 02789 * 02790 * If a block is given, invoke the block with MatchData if match succeed, so 02791 * that you can write 02792 * 02793 * str.match(pat) {|m| ...} 02794 * 02795 * instead of 02796 * 02797 * if m = str.match(pat) 02798 * ... 02799 * end 02800 * 02801 * The return value is a value from block execution in this case. 
02802 */ 02803 02804 static VALUE 02805 rb_str_match_m(int argc, VALUE *argv, VALUE str) 02806 { 02807 VALUE re, result; 02808 if (argc < 1) 02809 rb_check_arity(argc, 1, 2); 02810 re = argv[0]; 02811 argv[0] = str; 02812 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 02813 if (!NIL_P(result) && rb_block_given_p()) { 02814 return rb_yield(result); 02815 } 02816 return result; 02817 } 02818 02819 enum neighbor_char { 02820 NEIGHBOR_NOT_CHAR, 02821 NEIGHBOR_FOUND, 02822 NEIGHBOR_WRAPPED 02823 }; 02824 02825 static enum neighbor_char 02826 enc_succ_char(char *p, long len, rb_encoding *enc) 02827 { 02828 long i; 02829 int l; 02830 while (1) { 02831 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 02832 p[i] = '\0'; 02833 if (i < 0) 02834 return NEIGHBOR_WRAPPED; 02835 ++((unsigned char*)p)[i]; 02836 l = rb_enc_precise_mbclen(p, p+len, enc); 02837 if (MBCLEN_CHARFOUND_P(l)) { 02838 l = MBCLEN_CHARFOUND_LEN(l); 02839 if (l == len) { 02840 return NEIGHBOR_FOUND; 02841 } 02842 else { 02843 memset(p+l, 0xff, len-l); 02844 } 02845 } 02846 if (MBCLEN_INVALID_P(l) && i < len-1) { 02847 long len2; 02848 int l2; 02849 for (len2 = len-1; 0 < len2; len2--) { 02850 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02851 if (!MBCLEN_INVALID_P(l2)) 02852 break; 02853 } 02854 memset(p+len2+1, 0xff, len-(len2+1)); 02855 } 02856 } 02857 } 02858 02859 static enum neighbor_char 02860 enc_pred_char(char *p, long len, rb_encoding *enc) 02861 { 02862 long i; 02863 int l; 02864 while (1) { 02865 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 02866 p[i] = '\xff'; 02867 if (i < 0) 02868 return NEIGHBOR_WRAPPED; 02869 --((unsigned char*)p)[i]; 02870 l = rb_enc_precise_mbclen(p, p+len, enc); 02871 if (MBCLEN_CHARFOUND_P(l)) { 02872 l = MBCLEN_CHARFOUND_LEN(l); 02873 if (l == len) { 02874 return NEIGHBOR_FOUND; 02875 } 02876 else { 02877 memset(p+l, 0, len-l); 02878 } 02879 } 02880 if (MBCLEN_INVALID_P(l) && i < len-1) { 02881 long len2; 02882 int l2; 02883 
for (len2 = len-1; 0 < len2; len2--) { 02884 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02885 if (!MBCLEN_INVALID_P(l2)) 02886 break; 02887 } 02888 memset(p+len2+1, 0, len-(len2+1)); 02889 } 02890 } 02891 } 02892 02893 /* 02894 overwrite +p+ by succeeding letter in +enc+ and returns 02895 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 02896 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 02897 assuming each ranges are successive, and mbclen 02898 never change in each ranges. 02899 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 02900 character. 02901 */ 02902 static enum neighbor_char 02903 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 02904 { 02905 enum neighbor_char ret; 02906 unsigned int c; 02907 int ctype; 02908 int range; 02909 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 02910 02911 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02912 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 02913 ctype = ONIGENC_CTYPE_DIGIT; 02914 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 02915 ctype = ONIGENC_CTYPE_ALPHA; 02916 else 02917 return NEIGHBOR_NOT_CHAR; 02918 02919 MEMCPY(save, p, char, len); 02920 ret = enc_succ_char(p, len, enc); 02921 if (ret == NEIGHBOR_FOUND) { 02922 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02923 if (rb_enc_isctype(c, ctype, enc)) 02924 return NEIGHBOR_FOUND; 02925 } 02926 MEMCPY(p, save, char, len); 02927 range = 1; 02928 while (1) { 02929 MEMCPY(save, p, char, len); 02930 ret = enc_pred_char(p, len, enc); 02931 if (ret == NEIGHBOR_FOUND) { 02932 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02933 if (!rb_enc_isctype(c, ctype, enc)) { 02934 MEMCPY(p, save, char, len); 02935 break; 02936 } 02937 } 02938 else { 02939 MEMCPY(p, save, char, len); 02940 break; 02941 } 02942 range++; 02943 } 02944 if (range == 1) { 02945 return NEIGHBOR_NOT_CHAR; 02946 } 02947 02948 if (ctype != ONIGENC_CTYPE_DIGIT) { 02949 MEMCPY(carry, p, char, len); 02950 return NEIGHBOR_WRAPPED; 02951 } 02952 02953 
MEMCPY(carry, p, char, len); 02954 enc_succ_char(carry, len, enc); 02955 return NEIGHBOR_WRAPPED; 02956 } 02957 02958 02959 /* 02960 * call-seq: 02961 * str.succ -> new_str 02962 * str.next -> new_str 02963 * 02964 * Returns the successor to <i>str</i>. The successor is calculated by 02965 * incrementing characters starting from the rightmost alphanumeric (or 02966 * the rightmost character if there are no alphanumerics) in the 02967 * string. Incrementing a digit always results in another digit, and 02968 * incrementing a letter results in another letter of the same case. 02969 * Incrementing nonalphanumerics uses the underlying character set's 02970 * collating sequence. 02971 * 02972 * If the increment generates a ``carry,'' the character to the left of 02973 * it is incremented. This process repeats until there is no carry, 02974 * adding an additional character if necessary. 02975 * 02976 * "abcd".succ #=> "abce" 02977 * "THX1138".succ #=> "THX1139" 02978 * "<<koala>>".succ #=> "<<koalb>>" 02979 * "1999zzz".succ #=> "2000aaa" 02980 * "ZZZ9999".succ #=> "AAAA0000" 02981 * "***".succ #=> "**+" 02982 */ 02983 02984 VALUE 02985 rb_str_succ(VALUE orig) 02986 { 02987 rb_encoding *enc; 02988 VALUE str; 02989 char *sbeg, *s, *e, *last_alnum = 0; 02990 int c = -1; 02991 long l; 02992 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 02993 long carry_pos = 0, carry_len = 1; 02994 enum neighbor_char neighbor = NEIGHBOR_FOUND; 02995 02996 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 02997 rb_enc_cr_str_copy_for_substr(str, orig); 02998 OBJ_INFECT(str, orig); 02999 if (RSTRING_LEN(str) == 0) return str; 03000 03001 enc = STR_ENC_GET(orig); 03002 sbeg = RSTRING_PTR(str); 03003 s = e = sbeg + RSTRING_LEN(str); 03004 03005 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 03006 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 03007 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 03008 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 03009 s = last_alnum; 03010 break; 03011 } 03012 } 03013 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 03014 neighbor = enc_succ_alnum_char(s, l, enc, carry); 03015 switch (neighbor) { 03016 case NEIGHBOR_NOT_CHAR: 03017 continue; 03018 case NEIGHBOR_FOUND: 03019 return str; 03020 case NEIGHBOR_WRAPPED: 03021 last_alnum = s; 03022 break; 03023 } 03024 c = 1; 03025 carry_pos = s - sbeg; 03026 carry_len = l; 03027 } 03028 if (c == -1) { /* str contains no alnum */ 03029 s = e; 03030 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 03031 enum neighbor_char neighbor; 03032 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 03033 neighbor = enc_succ_char(s, l, enc); 03034 if (neighbor == NEIGHBOR_FOUND) 03035 return str; 03036 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 03037 /* wrapped to \0...\0. search next valid char. */ 03038 enc_succ_char(s, l, enc); 03039 } 03040 if (!rb_enc_asciicompat(enc)) { 03041 MEMCPY(carry, s, char, l); 03042 carry_len = l; 03043 } 03044 carry_pos = s - sbeg; 03045 } 03046 } 03047 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 03048 s = RSTRING_PTR(str) + carry_pos; 03049 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 03050 memmove(s, carry, carry_len); 03051 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 03052 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03053 rb_enc_str_coderange(str); 03054 return str; 03055 } 03056 03057 03058 /* 03059 * call-seq: 03060 * str.succ! -> str 03061 * str.next! -> str 03062 * 03063 * Equivalent to <code>String#succ</code>, but modifies the receiver in 03064 * place. 
03065 */ 03066 03067 static VALUE 03068 rb_str_succ_bang(VALUE str) 03069 { 03070 rb_str_shared_replace(str, rb_str_succ(str)); 03071 03072 return str; 03073 } 03074 03075 03076 /* 03077 * call-seq: 03078 * str.upto(other_str, exclusive=false) {|s| block } -> str 03079 * str.upto(other_str, exclusive=false) -> an_enumerator 03080 * 03081 * Iterates through successive values, starting at <i>str</i> and 03082 * ending at <i>other_str</i> inclusive, passing each value in turn to 03083 * the block. The <code>String#succ</code> method is used to generate 03084 * each value. If optional second argument exclusive is omitted or is false, 03085 * the last value will be included; otherwise it will be excluded. 03086 * 03087 * If no block is given, an enumerator is returned instead. 03088 * 03089 * "a8".upto("b6") {|s| print s, ' ' } 03090 * for s in "a8".."b6" 03091 * print s, ' ' 03092 * end 03093 * 03094 * <em>produces:</em> 03095 * 03096 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03097 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03098 * 03099 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 03100 * both are recognized as decimal numbers. In addition, the width of 03101 * string (e.g. leading zeros) is handled appropriately. 
03102 * 03103 * "9".upto("11").to_a #=> ["9", "10", "11"] 03104 * "25".upto("5").to_a #=> [] 03105 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 03106 */ 03107 03108 static VALUE 03109 rb_str_upto(int argc, VALUE *argv, VALUE beg) 03110 { 03111 VALUE end, exclusive; 03112 VALUE current, after_end; 03113 ID succ; 03114 int n, excl, ascii; 03115 rb_encoding *enc; 03116 03117 rb_scan_args(argc, argv, "11", &end, &exclusive); 03118 RETURN_ENUMERATOR(beg, argc, argv); 03119 excl = RTEST(exclusive); 03120 CONST_ID(succ, "succ"); 03121 StringValue(end); 03122 enc = rb_enc_check(beg, end); 03123 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 03124 /* single character */ 03125 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 03126 char c = RSTRING_PTR(beg)[0]; 03127 char e = RSTRING_PTR(end)[0]; 03128 03129 if (c > e || (excl && c == e)) return beg; 03130 for (;;) { 03131 rb_yield(rb_enc_str_new(&c, 1, enc)); 03132 if (!excl && c == e) break; 03133 c++; 03134 if (excl && c == e) break; 03135 } 03136 return beg; 03137 } 03138 /* both edges are all digits */ 03139 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 03140 char *s, *send; 03141 VALUE b, e; 03142 int width; 03143 03144 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 03145 width = rb_long2int(send - s); 03146 while (s < send) { 03147 if (!ISDIGIT(*s)) goto no_digits; 03148 s++; 03149 } 03150 s = RSTRING_PTR(end); send = RSTRING_END(end); 03151 while (s < send) { 03152 if (!ISDIGIT(*s)) goto no_digits; 03153 s++; 03154 } 03155 b = rb_str_to_inum(beg, 10, FALSE); 03156 e = rb_str_to_inum(end, 10, FALSE); 03157 if (FIXNUM_P(b) && FIXNUM_P(e)) { 03158 long bi = FIX2LONG(b); 03159 long ei = FIX2LONG(e); 03160 rb_encoding *usascii = rb_usascii_encoding(); 03161 03162 while (bi <= ei) { 03163 if (excl && bi == ei) break; 03164 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 03165 bi++; 03166 } 03167 } 03168 else { 03169 ID op = excl ? 
'<' : rb_intern("<="); 03170 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 03171 03172 args[0] = INT2FIX(width); 03173 while (rb_funcall(b, op, 1, e)) { 03174 args[1] = b; 03175 rb_yield(rb_str_format(numberof(args), args, fmt)); 03176 b = rb_funcall(b, succ, 0, 0); 03177 } 03178 } 03179 return beg; 03180 } 03181 /* normal case */ 03182 no_digits: 03183 n = rb_str_cmp(beg, end); 03184 if (n > 0 || (excl && n == 0)) return beg; 03185 03186 after_end = rb_funcall(end, succ, 0, 0); 03187 current = rb_str_dup(beg); 03188 while (!rb_str_equal(current, after_end)) { 03189 VALUE next = Qnil; 03190 if (excl || !rb_str_equal(current, end)) 03191 next = rb_funcall(current, succ, 0, 0); 03192 rb_yield(current); 03193 if (NIL_P(next)) break; 03194 current = next; 03195 StringValue(current); 03196 if (excl && rb_str_equal(current, end)) break; 03197 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 03198 break; 03199 } 03200 03201 return beg; 03202 } 03203 03204 static VALUE 03205 rb_str_subpat(VALUE str, VALUE re, VALUE backref) 03206 { 03207 if (rb_reg_search(re, str, 0, 0) >= 0) { 03208 VALUE match = rb_backref_get(); 03209 int nth = rb_reg_backref_number(match, backref); 03210 return rb_reg_nth_match(nth, match); 03211 } 03212 return Qnil; 03213 } 03214 03215 static VALUE 03216 rb_str_aref(VALUE str, VALUE indx) 03217 { 03218 long idx; 03219 03220 if (FIXNUM_P(indx)) { 03221 idx = FIX2LONG(indx); 03222 03223 num_index: 03224 str = rb_str_substr(str, idx, 1); 03225 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 03226 return str; 03227 } 03228 03229 if (SPECIAL_CONST_P(indx)) goto generic; 03230 switch (BUILTIN_TYPE(indx)) { 03231 case T_REGEXP: 03232 return rb_str_subpat(str, indx, INT2FIX(0)); 03233 03234 case T_STRING: 03235 if (rb_str_index(str, indx, 0) != -1) 03236 return rb_str_dup(indx); 03237 return Qnil; 03238 03239 generic: 03240 default: 03241 /* check if indx is Range */ 03242 { 03243 long beg, len; 03244 
VALUE tmp; 03245 03246 len = str_strlen(str, STR_ENC_GET(str)); 03247 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 03248 case Qfalse: 03249 break; 03250 case Qnil: 03251 return Qnil; 03252 default: 03253 tmp = rb_str_substr(str, beg, len); 03254 return tmp; 03255 } 03256 } 03257 idx = NUM2LONG(indx); 03258 goto num_index; 03259 } 03260 03261 UNREACHABLE; 03262 } 03263 03264 03265 /* 03266 * call-seq: 03267 * str[index] -> new_str or nil 03268 * str[start, length] -> new_str or nil 03269 * str[range] -> new_str or nil 03270 * str[regexp] -> new_str or nil 03271 * str[regexp, capture] -> new_str or nil 03272 * str[match_str] -> new_str or nil 03273 * str.slice(index) -> new_str or nil 03274 * str.slice(start, length) -> new_str or nil 03275 * str.slice(range) -> new_str or nil 03276 * str.slice(regexp) -> new_str or nil 03277 * str.slice(regexp, capture) -> new_str or nil 03278 * str.slice(match_str) -> new_str or nil 03279 * 03280 * Element Reference --- If passed a single +index+, returns a substring of 03281 * one character at that index. If passed a +start+ index and a +length+, 03282 * returns a substring containing +length+ characters starting at the 03283 * +index+. If passed a +range+, its beginning and end are interpreted as 03284 * offsets delimiting the substring to be returned. 03285 * 03286 * In these three cases, if an index is negative, it is counted from the end 03287 * of the string. For the +start+ and +range+ cases the starting index 03288 * is just before a character and an index matching the string's size. 03289 * Additionally, an empty string is returned when the starting index for a 03290 * character range is at the end of the string. 03291 * 03292 * Returns +nil+ if the initial index falls outside the string or the length 03293 * is negative. 03294 * 03295 * If a +Regexp+ is supplied, the matching portion of the string is 03296 * returned. 
If a +capture+ follows the regular expression, which may be a 03297 * capture group index or name, follows the regular expression that component 03298 * of the MatchData is returned instead. 03299 * 03300 * If a +match_str+ is given, that string is returned if it occurs in 03301 * the string. 03302 * 03303 * Returns +nil+ if the regular expression does not match or the match string 03304 * cannot be found. 03305 * 03306 * a = "hello there" 03307 * 03308 * a[1] #=> "e" 03309 * a[2, 3] #=> "llo" 03310 * a[2..3] #=> "ll" 03311 * 03312 * a[-3, 2] #=> "er" 03313 * a[7..-2] #=> "her" 03314 * a[-4..-2] #=> "her" 03315 * a[-2..-4] #=> "" 03316 * 03317 * a[11, 0] #=> "" 03318 * a[11] #=> nil 03319 * a[12, 0] #=> nil 03320 * a[12..-1] #=> nil 03321 * 03322 * a[/[aeiou](.)\1/] #=> "ell" 03323 * a[/[aeiou](.)\1/, 0] #=> "ell" 03324 * a[/[aeiou](.)\1/, 1] #=> "l" 03325 * a[/[aeiou](.)\1/, 2] #=> nil 03326 * 03327 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l" 03328 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e" 03329 * 03330 * a["lo"] #=> "lo" 03331 * a["bye"] #=> nil 03332 */ 03333 03334 static VALUE 03335 rb_str_aref_m(int argc, VALUE *argv, VALUE str) 03336 { 03337 if (argc == 2) { 03338 if (RB_TYPE_P(argv[0], T_REGEXP)) { 03339 return rb_str_subpat(str, argv[0], argv[1]); 03340 } 03341 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 03342 } 03343 rb_check_arity(argc, 1, 2); 03344 return rb_str_aref(str, argv[0]); 03345 } 03346 03347 VALUE 03348 rb_str_drop_bytes(VALUE str, long len) 03349 { 03350 char *ptr = RSTRING_PTR(str); 03351 long olen = RSTRING_LEN(str), nlen; 03352 03353 str_modifiable(str); 03354 if (len > olen) len = olen; 03355 nlen = olen - len; 03356 if (nlen <= RSTRING_EMBED_LEN_MAX) { 03357 char *oldptr = ptr; 03358 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 03359 STR_SET_EMBED(str); 03360 STR_SET_EMBED_LEN(str, nlen); 03361 ptr = RSTRING(str)->as.ary; 03362 memmove(ptr, oldptr + len, 
nlen); 03363 if (fl == STR_NOEMBED) xfree(oldptr); 03364 } 03365 else { 03366 if (!STR_SHARED_P(str)) rb_str_new4(str); 03367 ptr = RSTRING(str)->as.heap.ptr += len; 03368 RSTRING(str)->as.heap.len = nlen; 03369 } 03370 ptr[nlen] = 0; 03371 ENC_CODERANGE_CLEAR(str); 03372 return str; 03373 } 03374 03375 static void 03376 rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 03377 { 03378 if (beg == 0 && RSTRING_LEN(val) == 0) { 03379 rb_str_drop_bytes(str, len); 03380 OBJ_INFECT(str, val); 03381 return; 03382 } 03383 03384 rb_str_modify(str); 03385 if (len < RSTRING_LEN(val)) { 03386 /* expand string */ 03387 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 03388 } 03389 03390 if (RSTRING_LEN(val) != len) { 03391 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 03392 RSTRING_PTR(str) + beg + len, 03393 RSTRING_LEN(str) - (beg + len)); 03394 } 03395 if (RSTRING_LEN(val) < beg && len < 0) { 03396 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 03397 } 03398 if (RSTRING_LEN(val) > 0) { 03399 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 03400 } 03401 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 03402 if (RSTRING_PTR(str)) { 03403 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03404 } 03405 OBJ_INFECT(str, val); 03406 } 03407 03408 static void 03409 rb_str_splice(VALUE str, long beg, long len, VALUE val) 03410 { 03411 long slen; 03412 char *p, *e; 03413 rb_encoding *enc; 03414 int singlebyte = single_byte_optimizable(str); 03415 int cr; 03416 03417 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 03418 03419 StringValue(val); 03420 enc = rb_enc_check(str, val); 03421 slen = str_strlen(str, enc); 03422 03423 if (slen < beg) { 03424 out_of_range: 03425 rb_raise(rb_eIndexError, "index %ld out of string", beg); 03426 } 03427 if (beg < 0) { 03428 if (-beg > slen) { 03429 goto out_of_range; 03430 } 03431 beg += slen; 03432 } 03433 if (slen < len || slen < beg + len) { 03434 len = slen - beg; 
03435 } 03436 str_modify_keep_cr(str); 03437 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 03438 if (!p) p = RSTRING_END(str); 03439 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 03440 if (!e) e = RSTRING_END(str); 03441 /* error check */ 03442 beg = p - RSTRING_PTR(str); /* physical position */ 03443 len = e - p; /* physical length */ 03444 rb_str_splice_0(str, beg, len, val); 03445 rb_enc_associate(str, enc); 03446 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 03447 if (cr != ENC_CODERANGE_BROKEN) 03448 ENC_CODERANGE_SET(str, cr); 03449 } 03450 03451 void 03452 rb_str_update(VALUE str, long beg, long len, VALUE val) 03453 { 03454 rb_str_splice(str, beg, len, val); 03455 } 03456 03457 static void 03458 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 03459 { 03460 int nth; 03461 VALUE match; 03462 long start, end, len; 03463 rb_encoding *enc; 03464 struct re_registers *regs; 03465 03466 if (rb_reg_search(re, str, 0, 0) < 0) { 03467 rb_raise(rb_eIndexError, "regexp not matched"); 03468 } 03469 match = rb_backref_get(); 03470 nth = rb_reg_backref_number(match, backref); 03471 regs = RMATCH_REGS(match); 03472 if (nth >= regs->num_regs) { 03473 out_of_range: 03474 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 03475 } 03476 if (nth < 0) { 03477 if (-nth >= regs->num_regs) { 03478 goto out_of_range; 03479 } 03480 nth += regs->num_regs; 03481 } 03482 03483 start = BEG(nth); 03484 if (start == -1) { 03485 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 03486 } 03487 end = END(nth); 03488 len = end - start; 03489 StringValue(val); 03490 enc = rb_enc_check(str, val); 03491 rb_str_splice_0(str, start, len, val); 03492 rb_enc_associate(str, enc); 03493 } 03494 03495 static VALUE 03496 rb_str_aset(VALUE str, VALUE indx, VALUE val) 03497 { 03498 long idx, beg; 03499 03500 if (FIXNUM_P(indx)) { 03501 idx = FIX2LONG(indx); 03502 num_index: 03503 rb_str_splice(str, idx, 1, val); 03504 
return val; 03505 } 03506 03507 if (SPECIAL_CONST_P(indx)) goto generic; 03508 switch (TYPE(indx)) { 03509 case T_REGEXP: 03510 rb_str_subpat_set(str, indx, INT2FIX(0), val); 03511 return val; 03512 03513 case T_STRING: 03514 beg = rb_str_index(str, indx, 0); 03515 if (beg < 0) { 03516 rb_raise(rb_eIndexError, "string not matched"); 03517 } 03518 beg = rb_str_sublen(str, beg); 03519 rb_str_splice(str, beg, str_strlen(indx, 0), val); 03520 return val; 03521 03522 generic: 03523 default: 03524 /* check if indx is Range */ 03525 { 03526 long beg, len; 03527 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 03528 rb_str_splice(str, beg, len, val); 03529 return val; 03530 } 03531 } 03532 idx = NUM2LONG(indx); 03533 goto num_index; 03534 } 03535 } 03536 03537 /* 03538 * call-seq: 03539 * str[fixnum] = new_str 03540 * str[fixnum, fixnum] = new_str 03541 * str[range] = aString 03542 * str[regexp] = new_str 03543 * str[regexp, fixnum] = new_str 03544 * str[regexp, name] = new_str 03545 * str[other_str] = new_str 03546 * 03547 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 03548 * portion of the string affected is determined using the same criteria as 03549 * <code>String#[]</code>. If the replacement string is not the same length as 03550 * the text it is replacing, the string will be adjusted accordingly. If the 03551 * regular expression or string is used as the index doesn't match a position 03552 * in the string, <code>IndexError</code> is raised. If the regular expression 03553 * form is used, the optional second <code>Fixnum</code> allows you to specify 03554 * which portion of the match to replace (effectively using the 03555 * <code>MatchData</code> indexing rules. 
The forms that take a 03556 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 03557 * out of range; the <code>Range</code> form will raise a 03558 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 03559 * will raise an <code>IndexError</code> on negative match. 03560 */ 03561 03562 static VALUE 03563 rb_str_aset_m(int argc, VALUE *argv, VALUE str) 03564 { 03565 if (argc == 3) { 03566 if (RB_TYPE_P(argv[0], T_REGEXP)) { 03567 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 03568 } 03569 else { 03570 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 03571 } 03572 return argv[2]; 03573 } 03574 rb_check_arity(argc, 2, 3); 03575 return rb_str_aset(str, argv[0], argv[1]); 03576 } 03577 03578 /* 03579 * call-seq: 03580 * str.insert(index, other_str) -> str 03581 * 03582 * Inserts <i>other_str</i> before the character at the given 03583 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 03584 * end of the string, and insert <em>after</em> the given character. 03585 * The intent is insert <i>aString</i> so that it starts at the given 03586 * <i>index</i>. 
03587 * 03588 * "abcd".insert(0, 'X') #=> "Xabcd" 03589 * "abcd".insert(3, 'X') #=> "abcXd" 03590 * "abcd".insert(4, 'X') #=> "abcdX" 03591 * "abcd".insert(-3, 'X') #=> "abXcd" 03592 * "abcd".insert(-1, 'X') #=> "abcdX" 03593 */ 03594 03595 static VALUE 03596 rb_str_insert(VALUE str, VALUE idx, VALUE str2) 03597 { 03598 long pos = NUM2LONG(idx); 03599 03600 if (pos == -1) { 03601 return rb_str_append(str, str2); 03602 } 03603 else if (pos < 0) { 03604 pos++; 03605 } 03606 rb_str_splice(str, pos, 0, str2); 03607 return str; 03608 } 03609 03610 03611 /* 03612 * call-seq: 03613 * str.slice!(fixnum) -> fixnum or nil 03614 * str.slice!(fixnum, fixnum) -> new_str or nil 03615 * str.slice!(range) -> new_str or nil 03616 * str.slice!(regexp) -> new_str or nil 03617 * str.slice!(other_str) -> new_str or nil 03618 * 03619 * Deletes the specified portion from <i>str</i>, and returns the portion 03620 * deleted. 03621 * 03622 * string = "this is a string" 03623 * string.slice!(2) #=> "i" 03624 * string.slice!(3..6) #=> " is " 03625 * string.slice!(/s.*t/) #=> "sa st" 03626 * string.slice!("r") #=> "r" 03627 * string #=> "thing" 03628 */ 03629 03630 static VALUE 03631 rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 03632 { 03633 VALUE result; 03634 VALUE buf[3]; 03635 int i; 03636 03637 rb_check_arity(argc, 1, 2); 03638 for (i=0; i<argc; i++) { 03639 buf[i] = argv[i]; 03640 } 03641 str_modify_keep_cr(str); 03642 result = rb_str_aref_m(argc, buf, str); 03643 if (!NIL_P(result)) { 03644 buf[i] = rb_str_new(0,0); 03645 rb_str_aset_m(argc+1, buf, str); 03646 } 03647 return result; 03648 } 03649 03650 static VALUE 03651 get_pat(VALUE pat, int quote) 03652 { 03653 VALUE val; 03654 03655 switch (TYPE(pat)) { 03656 case T_REGEXP: 03657 return pat; 03658 03659 case T_STRING: 03660 break; 03661 03662 default: 03663 val = rb_check_string_type(pat); 03664 if (NIL_P(val)) { 03665 Check_Type(pat, T_REGEXP); 03666 } 03667 pat = val; 03668 } 03669 03670 if (quote) { 03671 pat = 
rb_reg_quote(pat); 03672 } 03673 03674 return rb_reg_regcomp(pat); 03675 } 03676 03677 03678 /* 03679 * call-seq: 03680 * str.sub!(pattern, replacement) -> str or nil 03681 * str.sub!(pattern) {|match| block } -> str or nil 03682 * 03683 * Performs the same substitution as String#sub in-place. 03684 * 03685 * Returns +str+ if a substitution was performed or +nil+ if no substitution 03686 * was performed. 03687 */ 03688 03689 static VALUE 03690 rb_str_sub_bang(int argc, VALUE *argv, VALUE str) 03691 { 03692 VALUE pat, repl, hash = Qnil; 03693 int iter = 0; 03694 int tainted = 0; 03695 int untrusted = 0; 03696 long plen; 03697 int min_arity = rb_block_given_p() ? 1 : 2; 03698 03699 rb_check_arity(argc, min_arity, 2); 03700 if (argc == 1) { 03701 iter = 1; 03702 } 03703 else { 03704 repl = argv[1]; 03705 hash = rb_check_hash_type(argv[1]); 03706 if (NIL_P(hash)) { 03707 StringValue(repl); 03708 } 03709 if (OBJ_TAINTED(repl)) tainted = 1; 03710 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03711 } 03712 03713 pat = get_pat(argv[0], 1); 03714 str_modifiable(str); 03715 if (rb_reg_search(pat, str, 0, 0) >= 0) { 03716 rb_encoding *enc; 03717 int cr = ENC_CODERANGE(str); 03718 VALUE match = rb_backref_get(); 03719 struct re_registers *regs = RMATCH_REGS(match); 03720 long beg0 = BEG(0); 03721 long end0 = END(0); 03722 char *p, *rp; 03723 long len, rlen; 03724 03725 if (iter || !NIL_P(hash)) { 03726 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03727 03728 if (iter) { 03729 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03730 } 03731 else { 03732 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); 03733 repl = rb_obj_as_string(repl); 03734 } 03735 str_mod_check(str, p, len); 03736 rb_check_frozen(str); 03737 } 03738 else { 03739 repl = rb_reg_regsub(repl, str, regs, pat); 03740 } 03741 enc = rb_enc_compatible(str, repl); 03742 if (!enc) { 03743 rb_encoding *str_enc = STR_ENC_GET(str); 03744 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03745 if 
(coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || 03746 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { 03747 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 03748 rb_enc_name(str_enc), 03749 rb_enc_name(STR_ENC_GET(repl))); 03750 } 03751 enc = STR_ENC_GET(repl); 03752 } 03753 rb_str_modify(str); 03754 rb_enc_associate(str, enc); 03755 if (OBJ_TAINTED(repl)) tainted = 1; 03756 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03757 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { 03758 int cr2 = ENC_CODERANGE(repl); 03759 if (cr2 == ENC_CODERANGE_BROKEN || 03760 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) 03761 cr = ENC_CODERANGE_UNKNOWN; 03762 else 03763 cr = cr2; 03764 } 03765 plen = end0 - beg0; 03766 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); 03767 len = RSTRING_LEN(str); 03768 if (rlen > plen) { 03769 RESIZE_CAPA(str, len + rlen - plen); 03770 } 03771 p = RSTRING_PTR(str); 03772 if (rlen != plen) { 03773 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); 03774 } 03775 memcpy(p + beg0, rp, rlen); 03776 len += rlen - plen; 03777 STR_SET_LEN(str, len); 03778 RSTRING_PTR(str)[len] = '\0'; 03779 ENC_CODERANGE_SET(str, cr); 03780 if (tainted) OBJ_TAINT(str); 03781 if (untrusted) OBJ_UNTRUST(str); 03782 03783 return str; 03784 } 03785 return Qnil; 03786 } 03787 03788 03789 /* 03790 * call-seq: 03791 * str.sub(pattern, replacement) -> new_str 03792 * str.sub(pattern, hash) -> new_str 03793 * str.sub(pattern) {|match| block } -> new_str 03794 * 03795 * Returns a copy of +str+ with the _first_ occurrence of +pattern+ 03796 * replaced by the second argument. The +pattern+ is typically a Regexp; if 03797 * given as a String, any regular expression metacharacters it contains will 03798 * be interpreted literally, e.g. <code>'\\\d'</code> will match a backlash 03799 * followed by 'd', instead of a digit. 
*
 *  If +replacement+ is a String it will be substituted for the matched text.
 *  It may contain back-references to the pattern's capture groups of the form
 *  <code>"\\d"</code>, where <i>d</i> is a group number, or
 *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
 *  double-quoted string, both back-references must be preceded by an
 *  additional backslash. However, within +replacement+ the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
 *
 *  If the second argument is a Hash, and the matched text is one of its keys,
 *  the corresponding value is the replacement string.
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
 *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
 *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
 *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
 *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
 *      #=> "Is /bin/bash your preferred shell?"
03826 */ 03827 03828 static VALUE 03829 rb_str_sub(int argc, VALUE *argv, VALUE str) 03830 { 03831 str = rb_str_dup(str); 03832 rb_str_sub_bang(argc, argv, str); 03833 return str; 03834 } 03835 03836 static VALUE 03837 str_gsub(int argc, VALUE *argv, VALUE str, int bang) 03838 { 03839 VALUE pat, val, repl, match, dest, hash = Qnil; 03840 struct re_registers *regs; 03841 long beg, n; 03842 long beg0, end0; 03843 long offset, blen, slen, len, last; 03844 int iter = 0; 03845 char *sp, *cp; 03846 int tainted = 0; 03847 rb_encoding *str_enc; 03848 03849 switch (argc) { 03850 case 1: 03851 RETURN_ENUMERATOR(str, argc, argv); 03852 iter = 1; 03853 break; 03854 case 2: 03855 repl = argv[1]; 03856 hash = rb_check_hash_type(argv[1]); 03857 if (NIL_P(hash)) { 03858 StringValue(repl); 03859 } 03860 if (OBJ_TAINTED(repl)) tainted = 1; 03861 break; 03862 default: 03863 rb_check_arity(argc, 1, 2); 03864 } 03865 03866 pat = get_pat(argv[0], 1); 03867 beg = rb_reg_search(pat, str, 0, 0); 03868 if (beg < 0) { 03869 if (bang) return Qnil; /* no match, no substitution */ 03870 return rb_str_dup(str); 03871 } 03872 03873 offset = 0; 03874 n = 0; 03875 blen = RSTRING_LEN(str) + 30; /* len + margin */ 03876 dest = rb_str_buf_new(blen); 03877 sp = RSTRING_PTR(str); 03878 slen = RSTRING_LEN(str); 03879 cp = sp; 03880 str_enc = STR_ENC_GET(str); 03881 rb_enc_associate(dest, str_enc); 03882 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? 
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); 03883 03884 do { 03885 n++; 03886 match = rb_backref_get(); 03887 regs = RMATCH_REGS(match); 03888 beg0 = BEG(0); 03889 end0 = END(0); 03890 if (iter || !NIL_P(hash)) { 03891 if (iter) { 03892 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03893 } 03894 else { 03895 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0))); 03896 val = rb_obj_as_string(val); 03897 } 03898 str_mod_check(str, sp, slen); 03899 if (val == dest) { /* paranoid check [ruby-dev:24827] */ 03900 rb_raise(rb_eRuntimeError, "block should not cheat"); 03901 } 03902 } 03903 else { 03904 val = rb_reg_regsub(repl, str, regs, pat); 03905 } 03906 03907 if (OBJ_TAINTED(val)) tainted = 1; 03908 03909 len = beg0 - offset; /* copy pre-match substr */ 03910 if (len) { 03911 rb_enc_str_buf_cat(dest, cp, len, str_enc); 03912 } 03913 03914 rb_str_buf_append(dest, val); 03915 03916 last = offset; 03917 offset = end0; 03918 if (beg0 == end0) { 03919 /* 03920 * Always consume at least one character of the input string 03921 * in order to prevent infinite loops. 
03922 */ 03923 if (RSTRING_LEN(str) <= end0) break; 03924 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); 03925 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc); 03926 offset = end0 + len; 03927 } 03928 cp = RSTRING_PTR(str) + offset; 03929 if (offset > RSTRING_LEN(str)) break; 03930 beg = rb_reg_search(pat, str, offset, 0); 03931 } while (beg >= 0); 03932 if (RSTRING_LEN(str) > offset) { 03933 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); 03934 } 03935 rb_reg_search(pat, str, last, 0); 03936 if (bang) { 03937 rb_str_shared_replace(str, dest); 03938 } 03939 else { 03940 RBASIC(dest)->klass = rb_obj_class(str); 03941 OBJ_INFECT(dest, str); 03942 str = dest; 03943 } 03944 03945 if (tainted) OBJ_TAINT(str); 03946 return str; 03947 } 03948 03949 03950 /* 03951 * call-seq: 03952 * str.gsub!(pattern, replacement) -> str or nil 03953 * str.gsub!(pattern) {|match| block } -> str or nil 03954 * str.gsub!(pattern) -> an_enumerator 03955 * 03956 * Performs the substitutions of <code>String#gsub</code> in place, returning 03957 * <i>str</i>, or <code>nil</code> if no substitutions were performed. 03958 * If no block and no <i>replacement</i> is given, an enumerator is returned instead. 03959 */ 03960 03961 static VALUE 03962 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) 03963 { 03964 str_modify_keep_cr(str); 03965 return str_gsub(argc, argv, str, 1); 03966 } 03967 03968 03969 /* 03970 * call-seq: 03971 * str.gsub(pattern, replacement) -> new_str 03972 * str.gsub(pattern, hash) -> new_str 03973 * str.gsub(pattern) {|match| block } -> new_str 03974 * str.gsub(pattern) -> enumerator 03975 * 03976 * Returns a copy of <i>str</i> with the <em>all</em> occurrences of 03977 * <i>pattern</i> substituted for the second argument. 
*  The <i>pattern</i> is
 *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
 *  regular expression metacharacters it contains will be interpreted
 *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
 *  instead of a digit.
 *
 *  If <i>replacement</i> is a <code>String</code> it will be substituted for
 *  the matched text. It may contain back-references to the pattern's capture
 *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
 *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
 *  double-quoted string, both back-references must be preceded by an
 *  additional backslash. However, within <i>replacement</i> the special match
 *  variables, such as <code>$&</code>, will not refer to the current match.
 *
 *  If the second argument is a <code>Hash</code>, and the matched text is one
 *  of its keys, the corresponding value is the replacement string.
 *
 *  In the block form, the current match string is passed in as a parameter,
 *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
 *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
 *  returned by the block will be substituted for the match on each call.
 *
 *  The result inherits any tainting in the original string or any supplied
 *  replacement string.
 *
 *  When neither a block nor a second argument is supplied, an
 *  <code>Enumerator</code> is returned.
04004 * 04005 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 04006 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 04007 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 04008 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 04009 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 04010 */ 04011 04012 static VALUE 04013 rb_str_gsub(int argc, VALUE *argv, VALUE str) 04014 { 04015 return str_gsub(argc, argv, str, 0); 04016 } 04017 04018 04019 /* 04020 * call-seq: 04021 * str.replace(other_str) -> str 04022 * 04023 * Replaces the contents and taintedness of <i>str</i> with the corresponding 04024 * values in <i>other_str</i>. 04025 * 04026 * s = "hello" #=> "hello" 04027 * s.replace "world" #=> "world" 04028 */ 04029 04030 VALUE 04031 rb_str_replace(VALUE str, VALUE str2) 04032 { 04033 str_modifiable(str); 04034 if (str == str2) return str; 04035 04036 StringValue(str2); 04037 str_discard(str); 04038 return str_replace(str, str2); 04039 } 04040 04041 /* 04042 * call-seq: 04043 * string.clear -> string 04044 * 04045 * Makes string empty. 04046 * 04047 * a = "abcde" 04048 * a.clear #=> "" 04049 */ 04050 04051 static VALUE 04052 rb_str_clear(VALUE str) 04053 { 04054 str_discard(str); 04055 STR_SET_EMBED(str); 04056 STR_SET_EMBED_LEN(str, 0); 04057 RSTRING_PTR(str)[0] = 0; 04058 if (rb_enc_asciicompat(STR_ENC_GET(str))) 04059 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04060 else 04061 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04062 return str; 04063 } 04064 04065 /* 04066 * call-seq: 04067 * string.chr -> string 04068 * 04069 * Returns a one-character string at the beginning of the string. 04070 * 04071 * a = "abcde" 04072 * a.chr #=> "a" 04073 */ 04074 04075 static VALUE 04076 rb_str_chr(VALUE str) 04077 { 04078 return rb_str_substr(str, 0, 1); 04079 } 04080 04081 /* 04082 * call-seq: 04083 * str.getbyte(index) -> 0 .. 255 04084 * 04085 * returns the <i>index</i>th byte as an integer. 
04086 */ 04087 static VALUE 04088 rb_str_getbyte(VALUE str, VALUE index) 04089 { 04090 long pos = NUM2LONG(index); 04091 04092 if (pos < 0) 04093 pos += RSTRING_LEN(str); 04094 if (pos < 0 || RSTRING_LEN(str) <= pos) 04095 return Qnil; 04096 04097 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 04098 } 04099 04100 /* 04101 * call-seq: 04102 * str.setbyte(index, integer) -> integer 04103 * 04104 * modifies the <i>index</i>th byte as <i>integer</i>. 04105 */ 04106 static VALUE 04107 rb_str_setbyte(VALUE str, VALUE index, VALUE value) 04108 { 04109 long pos = NUM2LONG(index); 04110 int byte = NUM2INT(value); 04111 04112 rb_str_modify(str); 04113 04114 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 04115 rb_raise(rb_eIndexError, "index %ld out of string", pos); 04116 if (pos < 0) 04117 pos += RSTRING_LEN(str); 04118 04119 RSTRING_PTR(str)[pos] = byte; 04120 04121 return value; 04122 } 04123 04124 static VALUE 04125 str_byte_substr(VALUE str, long beg, long len) 04126 { 04127 char *p, *s = RSTRING_PTR(str); 04128 long n = RSTRING_LEN(str); 04129 VALUE str2; 04130 04131 if (beg > n || len < 0) return Qnil; 04132 if (beg < 0) { 04133 beg += n; 04134 if (beg < 0) return Qnil; 04135 } 04136 if (beg + len > n) 04137 len = n - beg; 04138 if (len <= 0) { 04139 len = 0; 04140 p = 0; 04141 } 04142 else 04143 p = s + beg; 04144 04145 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 04146 str2 = rb_str_new4(str); 04147 str2 = str_new3(rb_obj_class(str2), str2); 04148 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 04149 RSTRING(str2)->as.heap.len = len; 04150 } 04151 else { 04152 str2 = rb_str_new5(str, p, len); 04153 } 04154 04155 str_enc_copy(str2, str); 04156 04157 if (RSTRING_LEN(str2) == 0) { 04158 if (!rb_enc_asciicompat(STR_ENC_GET(str))) 04159 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 04160 else 04161 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04162 } 04163 else { 04164 switch (ENC_CODERANGE(str)) { 04165 case ENC_CODERANGE_7BIT: 
04166 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04167 break; 04168 default: 04169 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); 04170 break; 04171 } 04172 } 04173 04174 OBJ_INFECT(str2, str); 04175 04176 return str2; 04177 } 04178 04179 static VALUE 04180 str_byte_aref(VALUE str, VALUE indx) 04181 { 04182 long idx; 04183 switch (TYPE(indx)) { 04184 case T_FIXNUM: 04185 idx = FIX2LONG(indx); 04186 04187 num_index: 04188 str = str_byte_substr(str, idx, 1); 04189 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 04190 return str; 04191 04192 default: 04193 /* check if indx is Range */ 04194 { 04195 long beg, len = RSTRING_LEN(str); 04196 04197 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 04198 case Qfalse: 04199 break; 04200 case Qnil: 04201 return Qnil; 04202 default: 04203 return str_byte_substr(str, beg, len); 04204 } 04205 } 04206 idx = NUM2LONG(indx); 04207 goto num_index; 04208 } 04209 04210 UNREACHABLE; 04211 } 04212 04213 /* 04214 * call-seq: 04215 * str.byteslice(fixnum) -> new_str or nil 04216 * str.byteslice(fixnum, fixnum) -> new_str or nil 04217 * str.byteslice(range) -> new_str or nil 04218 * 04219 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 04220 * substring of one byte at that position. If passed two <code>Fixnum</code> 04221 * objects, returns a substring starting at the offset given by the first, and 04222 * a length given by the second. If given a <code>Range</code>, a substring containing 04223 * bytes at offsets given by the range is returned. In all three cases, if 04224 * an offset is negative, it is counted from the end of <i>str</i>. Returns 04225 * <code>nil</code> if the initial offset falls outside the string, the length 04226 * is negative, or the beginning of the range is greater than the end. 04227 * The encoding of the resulted string keeps original encoding. 
04228 * 04229 * "hello".byteslice(1) #=> "e" 04230 * "hello".byteslice(-1) #=> "o" 04231 * "hello".byteslice(1, 2) #=> "el" 04232 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 04233 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042" 04234 */ 04235 04236 static VALUE 04237 rb_str_byteslice(int argc, VALUE *argv, VALUE str) 04238 { 04239 if (argc == 2) { 04240 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 04241 } 04242 rb_check_arity(argc, 1, 2); 04243 return str_byte_aref(str, argv[0]); 04244 } 04245 04246 /* 04247 * call-seq: 04248 * str.reverse -> new_str 04249 * 04250 * Returns a new string with the characters from <i>str</i> in reverse order. 04251 * 04252 * "stressed".reverse #=> "desserts" 04253 */ 04254 04255 static VALUE 04256 rb_str_reverse(VALUE str) 04257 { 04258 rb_encoding *enc; 04259 VALUE rev; 04260 char *s, *e, *p; 04261 int single = 1; 04262 04263 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 04264 enc = STR_ENC_GET(str); 04265 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 04266 s = RSTRING_PTR(str); e = RSTRING_END(str); 04267 p = RSTRING_END(rev); 04268 04269 if (RSTRING_LEN(str) > 1) { 04270 if (single_byte_optimizable(str)) { 04271 while (s < e) { 04272 *--p = *s++; 04273 } 04274 } 04275 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 04276 while (s < e) { 04277 int clen = rb_enc_fast_mbclen(s, e, enc); 04278 04279 if (clen > 1 || (*s & 0x80)) single = 0; 04280 p -= clen; 04281 memcpy(p, s, clen); 04282 s += clen; 04283 } 04284 } 04285 else { 04286 while (s < e) { 04287 int clen = rb_enc_mbclen(s, e, enc); 04288 04289 if (clen > 1 || (*s & 0x80)) single = 0; 04290 p -= clen; 04291 memcpy(p, s, clen); 04292 s += clen; 04293 } 04294 } 04295 } 04296 STR_SET_LEN(rev, RSTRING_LEN(str)); 04297 OBJ_INFECT(rev, str); 04298 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 04299 if (single) { 04300 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04301 } 04302 else { 04303 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04304 } 04305 } 
04306 rb_enc_cr_str_copy_for_substr(rev, str); 04307 04308 return rev; 04309 } 04310 04311 04312 /* 04313 * call-seq: 04314 * str.reverse! -> str 04315 * 04316 * Reverses <i>str</i> in place. 04317 */ 04318 04319 static VALUE 04320 rb_str_reverse_bang(VALUE str) 04321 { 04322 if (RSTRING_LEN(str) > 1) { 04323 if (single_byte_optimizable(str)) { 04324 char *s, *e, c; 04325 04326 str_modify_keep_cr(str); 04327 s = RSTRING_PTR(str); 04328 e = RSTRING_END(str) - 1; 04329 while (s < e) { 04330 c = *s; 04331 *s++ = *e; 04332 *e-- = c; 04333 } 04334 } 04335 else { 04336 rb_str_shared_replace(str, rb_str_reverse(str)); 04337 } 04338 } 04339 else { 04340 str_modify_keep_cr(str); 04341 } 04342 return str; 04343 } 04344 04345 04346 /* 04347 * call-seq: 04348 * str.include? other_str -> true or false 04349 * 04350 * Returns <code>true</code> if <i>str</i> contains the given string or 04351 * character. 04352 * 04353 * "hello".include? "lo" #=> true 04354 * "hello".include? "ol" #=> false 04355 * "hello".include? ?h #=> true 04356 */ 04357 04358 static VALUE 04359 rb_str_include(VALUE str, VALUE arg) 04360 { 04361 long i; 04362 04363 StringValue(arg); 04364 i = rb_str_index(str, arg, 0); 04365 04366 if (i == -1) return Qfalse; 04367 return Qtrue; 04368 } 04369 04370 04371 /* 04372 * call-seq: 04373 * str.to_i(base=10) -> integer 04374 * 04375 * Returns the result of interpreting leading characters in <i>str</i> as an 04376 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 04377 * end of a valid number are ignored. If there is not a valid number at the 04378 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 04379 * exception when <i>base</i> is valid. 
04380 * 04381 * "12345".to_i #=> 12345 04382 * "99 red balloons".to_i #=> 99 04383 * "0a".to_i #=> 0 04384 * "0a".to_i(16) #=> 10 04385 * "hello".to_i #=> 0 04386 * "1100101".to_i(2) #=> 101 04387 * "1100101".to_i(8) #=> 294977 04388 * "1100101".to_i(10) #=> 1100101 04389 * "1100101".to_i(16) #=> 17826049 04390 */ 04391 04392 static VALUE 04393 rb_str_to_i(int argc, VALUE *argv, VALUE str) 04394 { 04395 int base; 04396 04397 if (argc == 0) base = 10; 04398 else { 04399 VALUE b; 04400 04401 rb_scan_args(argc, argv, "01", &b); 04402 base = NUM2INT(b); 04403 } 04404 if (base < 0) { 04405 rb_raise(rb_eArgError, "invalid radix %d", base); 04406 } 04407 return rb_str_to_inum(str, base, FALSE); 04408 } 04409 04410 04411 /* 04412 * call-seq: 04413 * str.to_f -> float 04414 * 04415 * Returns the result of interpreting leading characters in <i>str</i> as a 04416 * floating point number. Extraneous characters past the end of a valid number 04417 * are ignored. If there is not a valid number at the start of <i>str</i>, 04418 * <code>0.0</code> is returned. This method never raises an exception. 04419 * 04420 * "123.45e1".to_f #=> 1234.5 04421 * "45.67 degrees".to_f #=> 45.67 04422 * "thx1138".to_f #=> 0.0 04423 */ 04424 04425 static VALUE 04426 rb_str_to_f(VALUE str) 04427 { 04428 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 04429 } 04430 04431 04432 /* 04433 * call-seq: 04434 * str.to_s -> str 04435 * str.to_str -> str 04436 * 04437 * Returns the receiver. 
04438 */ 04439 04440 static VALUE 04441 rb_str_to_s(VALUE str) 04442 { 04443 if (rb_obj_class(str) != rb_cString) { 04444 return str_duplicate(rb_cString, str); 04445 } 04446 return str; 04447 } 04448 04449 #if 0 04450 static void 04451 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 04452 { 04453 char s[RUBY_MAX_CHAR_LEN]; 04454 int n = rb_enc_codelen(c, enc); 04455 04456 rb_enc_mbcput(c, s, enc); 04457 rb_enc_str_buf_cat(str, s, n, enc); 04458 } 04459 #endif 04460 04461 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 04462 04463 int 04464 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 04465 { 04466 char buf[CHAR_ESC_LEN + 1]; 04467 int l; 04468 04469 #if SIZEOF_INT > 4 04470 c &= 0xffffffff; 04471 #endif 04472 if (unicode_p) { 04473 if (c < 0x7F && ISPRINT(c)) { 04474 snprintf(buf, CHAR_ESC_LEN, "%c", c); 04475 } 04476 else if (c < 0x10000) { 04477 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 04478 } 04479 else { 04480 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 04481 } 04482 } 04483 else { 04484 if (c < 0x100) { 04485 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 04486 } 04487 else { 04488 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 04489 } 04490 } 04491 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 04492 rb_str_buf_cat(result, buf, l); 04493 return l; 04494 } 04495 04496 /* 04497 * call-seq: 04498 * str.inspect -> string 04499 * 04500 * Returns a printable version of _str_, surrounded by quote marks, 04501 * with special characters escaped. 
04502 * 04503 * str = "hello" 04504 * str[3] = "\b" 04505 * str.inspect #=> "\"hel\\bo\"" 04506 */ 04507 04508 VALUE 04509 rb_str_inspect(VALUE str) 04510 { 04511 rb_encoding *enc = STR_ENC_GET(str); 04512 const char *p, *pend, *prev; 04513 char buf[CHAR_ESC_LEN + 1]; 04514 VALUE result = rb_str_buf_new(0); 04515 rb_encoding *resenc = rb_default_internal_encoding(); 04516 int unicode_p = rb_enc_unicode_p(enc); 04517 int asciicompat = rb_enc_asciicompat(enc); 04518 static rb_encoding *utf16, *utf32; 04519 04520 if (!utf16) utf16 = rb_enc_find("UTF-16"); 04521 if (!utf32) utf32 = rb_enc_find("UTF-32"); 04522 if (resenc == NULL) resenc = rb_default_external_encoding(); 04523 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 04524 rb_enc_associate(result, resenc); 04525 str_buf_cat2(result, "\""); 04526 04527 p = RSTRING_PTR(str); pend = RSTRING_END(str); 04528 prev = p; 04529 if (enc == utf16) { 04530 const unsigned char *q = (const unsigned char *)p; 04531 if (q[0] == 0xFE && q[1] == 0xFF) 04532 enc = rb_enc_find("UTF-16BE"); 04533 else if (q[0] == 0xFF && q[1] == 0xFE) 04534 enc = rb_enc_find("UTF-16LE"); 04535 else 04536 unicode_p = 0; 04537 } 04538 else if (enc == utf32) { 04539 const unsigned char *q = (const unsigned char *)p; 04540 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 04541 enc = rb_enc_find("UTF-32BE"); 04542 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 04543 enc = rb_enc_find("UTF-32LE"); 04544 else 04545 unicode_p = 0; 04546 } 04547 while (p < pend) { 04548 unsigned int c, cc; 04549 int n; 04550 04551 n = rb_enc_precise_mbclen(p, pend, enc); 04552 if (!MBCLEN_CHARFOUND_P(n)) { 04553 if (p > prev) str_buf_cat(result, prev, p - prev); 04554 n = rb_enc_mbminlen(enc); 04555 if (pend < p + n) 04556 n = (int)(pend - p); 04557 while (n--) { 04558 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 04559 str_buf_cat(result, buf, strlen(buf)); 04560 prev = ++p; 04561 } 04562 continue; 04563 } 04564 n = 
MBCLEN_CHARFOUND_LEN(n); 04565 c = rb_enc_mbc_to_codepoint(p, pend, enc); 04566 p += n; 04567 if ((asciicompat || unicode_p) && 04568 (c == '"'|| c == '\\' || 04569 (c == '#' && 04570 p < pend && 04571 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 04572 (cc = rb_enc_codepoint(p,pend,enc), 04573 (cc == '$' || cc == '@' || cc == '{'))))) { 04574 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04575 str_buf_cat2(result, "\\"); 04576 if (asciicompat || enc == resenc) { 04577 prev = p - n; 04578 continue; 04579 } 04580 } 04581 switch (c) { 04582 case '\n': cc = 'n'; break; 04583 case '\r': cc = 'r'; break; 04584 case '\t': cc = 't'; break; 04585 case '\f': cc = 'f'; break; 04586 case '\013': cc = 'v'; break; 04587 case '\010': cc = 'b'; break; 04588 case '\007': cc = 'a'; break; 04589 case 033: cc = 'e'; break; 04590 default: cc = 0; break; 04591 } 04592 if (cc) { 04593 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04594 buf[0] = '\\'; 04595 buf[1] = (char)cc; 04596 str_buf_cat(result, buf, 2); 04597 prev = p; 04598 continue; 04599 } 04600 if ((enc == resenc && rb_enc_isprint(c, enc)) || 04601 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 04602 continue; 04603 } 04604 else { 04605 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04606 rb_str_buf_cat_escaped_char(result, c, unicode_p); 04607 prev = p; 04608 continue; 04609 } 04610 } 04611 if (p > prev) str_buf_cat(result, prev, p - prev); 04612 str_buf_cat2(result, "\""); 04613 04614 OBJ_INFECT(result, str); 04615 return result; 04616 } 04617 04618 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 04619 04620 /* 04621 * call-seq: 04622 * str.dump -> new_str 04623 * 04624 * Produces a version of +str+ with all non-printing characters replaced by 04625 * <code>\nnn</code> notation and all special characters escaped. 
04626 * 04627 * "hello \n ''".dump #=> "\"hello \\n ''\" 04628 */ 04629 04630 VALUE 04631 rb_str_dump(VALUE str) 04632 { 04633 rb_encoding *enc = rb_enc_get(str); 04634 long len; 04635 const char *p, *pend; 04636 char *q, *qend; 04637 VALUE result; 04638 int u8 = (enc == rb_utf8_encoding()); 04639 04640 len = 2; /* "" */ 04641 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04642 while (p < pend) { 04643 unsigned char c = *p++; 04644 switch (c) { 04645 case '"': case '\\': 04646 case '\n': case '\r': 04647 case '\t': case '\f': 04648 case '\013': case '\010': case '\007': case '\033': 04649 len += 2; 04650 break; 04651 04652 case '#': 04653 len += IS_EVSTR(p, pend) ? 2 : 1; 04654 break; 04655 04656 default: 04657 if (ISPRINT(c)) { 04658 len++; 04659 } 04660 else { 04661 if (u8) { /* \u{NN} */ 04662 int n = rb_enc_precise_mbclen(p-1, pend, enc); 04663 if (MBCLEN_CHARFOUND_P(n-1)) { 04664 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04665 while (cc >>= 4) len++; 04666 len += 5; 04667 p += MBCLEN_CHARFOUND_LEN(n)-1; 04668 break; 04669 } 04670 } 04671 len += 4; /* \xNN */ 04672 } 04673 break; 04674 } 04675 } 04676 if (!rb_enc_asciicompat(enc)) { 04677 len += 19; /* ".force_encoding('')" */ 04678 len += strlen(enc->name); 04679 } 04680 04681 result = rb_str_new5(str, 0, len); 04682 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04683 q = RSTRING_PTR(result); qend = q + len + 1; 04684 04685 *q++ = '"'; 04686 while (p < pend) { 04687 unsigned char c = *p++; 04688 04689 if (c == '"' || c == '\\') { 04690 *q++ = '\\'; 04691 *q++ = c; 04692 } 04693 else if (c == '#') { 04694 if (IS_EVSTR(p, pend)) *q++ = '\\'; 04695 *q++ = '#'; 04696 } 04697 else if (c == '\n') { 04698 *q++ = '\\'; 04699 *q++ = 'n'; 04700 } 04701 else if (c == '\r') { 04702 *q++ = '\\'; 04703 *q++ = 'r'; 04704 } 04705 else if (c == '\t') { 04706 *q++ = '\\'; 04707 *q++ = 't'; 04708 } 04709 else if (c == '\f') { 04710 *q++ = '\\'; 04711 *q++ = 'f'; 04712 } 04713 else if (c == '\013') { 
04714 *q++ = '\\'; 04715 *q++ = 'v'; 04716 } 04717 else if (c == '\010') { 04718 *q++ = '\\'; 04719 *q++ = 'b'; 04720 } 04721 else if (c == '\007') { 04722 *q++ = '\\'; 04723 *q++ = 'a'; 04724 } 04725 else if (c == '\033') { 04726 *q++ = '\\'; 04727 *q++ = 'e'; 04728 } 04729 else if (ISPRINT(c)) { 04730 *q++ = c; 04731 } 04732 else { 04733 *q++ = '\\'; 04734 if (u8) { 04735 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1; 04736 if (MBCLEN_CHARFOUND_P(n)) { 04737 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04738 p += n; 04739 snprintf(q, qend-q, "u{%x}", cc); 04740 q += strlen(q); 04741 continue; 04742 } 04743 } 04744 snprintf(q, qend-q, "x%02X", c); 04745 q += 3; 04746 } 04747 } 04748 *q++ = '"'; 04749 *q = '\0'; 04750 if (!rb_enc_asciicompat(enc)) { 04751 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); 04752 enc = rb_ascii8bit_encoding(); 04753 } 04754 OBJ_INFECT(result, str); 04755 /* result from dump is ASCII */ 04756 rb_enc_associate(result, enc); 04757 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); 04758 return result; 04759 } 04760 04761 04762 static void 04763 rb_str_check_dummy_enc(rb_encoding *enc) 04764 { 04765 if (rb_enc_dummy_p(enc)) { 04766 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s", 04767 rb_enc_name(enc)); 04768 } 04769 } 04770 04771 /* 04772 * call-seq: 04773 * str.upcase! -> str or nil 04774 * 04775 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes 04776 * were made. 04777 * Note: case replacement is effective only in ASCII region. 
04778 */ 04779 04780 static VALUE 04781 rb_str_upcase_bang(VALUE str) 04782 { 04783 rb_encoding *enc; 04784 char *s, *send; 04785 int modify = 0; 04786 int n; 04787 04788 str_modify_keep_cr(str); 04789 enc = STR_ENC_GET(str); 04790 rb_str_check_dummy_enc(enc); 04791 s = RSTRING_PTR(str); send = RSTRING_END(str); 04792 if (single_byte_optimizable(str)) { 04793 while (s < send) { 04794 unsigned int c = *(unsigned char*)s; 04795 04796 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04797 *s = 'A' + (c - 'a'); 04798 modify = 1; 04799 } 04800 s++; 04801 } 04802 } 04803 else { 04804 int ascompat = rb_enc_asciicompat(enc); 04805 04806 while (s < send) { 04807 unsigned int c; 04808 04809 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04810 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04811 *s = 'A' + (c - 'a'); 04812 modify = 1; 04813 } 04814 s++; 04815 } 04816 else { 04817 c = rb_enc_codepoint_len(s, send, &n, enc); 04818 if (rb_enc_islower(c, enc)) { 04819 /* assuming toupper returns codepoint with same size */ 04820 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04821 modify = 1; 04822 } 04823 s += n; 04824 } 04825 } 04826 } 04827 04828 if (modify) return str; 04829 return Qnil; 04830 } 04831 04832 04833 /* 04834 * call-seq: 04835 * str.upcase -> new_str 04836 * 04837 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 04838 * uppercase counterparts. The operation is locale insensitive---only 04839 * characters ``a'' to ``z'' are affected. 04840 * Note: case replacement is effective only in ASCII region. 04841 * 04842 * "hEllO".upcase #=> "HELLO" 04843 */ 04844 04845 static VALUE 04846 rb_str_upcase(VALUE str) 04847 { 04848 str = rb_str_dup(str); 04849 rb_str_upcase_bang(str); 04850 return str; 04851 } 04852 04853 04854 /* 04855 * call-seq: 04856 * str.downcase! -> str or nil 04857 * 04858 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 04859 * changes were made. 
04860 * Note: case replacement is effective only in ASCII region. 04861 */ 04862 04863 static VALUE 04864 rb_str_downcase_bang(VALUE str) 04865 { 04866 rb_encoding *enc; 04867 char *s, *send; 04868 int modify = 0; 04869 04870 str_modify_keep_cr(str); 04871 enc = STR_ENC_GET(str); 04872 rb_str_check_dummy_enc(enc); 04873 s = RSTRING_PTR(str); send = RSTRING_END(str); 04874 if (single_byte_optimizable(str)) { 04875 while (s < send) { 04876 unsigned int c = *(unsigned char*)s; 04877 04878 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04879 *s = 'a' + (c - 'A'); 04880 modify = 1; 04881 } 04882 s++; 04883 } 04884 } 04885 else { 04886 int ascompat = rb_enc_asciicompat(enc); 04887 04888 while (s < send) { 04889 unsigned int c; 04890 int n; 04891 04892 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04893 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04894 *s = 'a' + (c - 'A'); 04895 modify = 1; 04896 } 04897 s++; 04898 } 04899 else { 04900 c = rb_enc_codepoint_len(s, send, &n, enc); 04901 if (rb_enc_isupper(c, enc)) { 04902 /* assuming toupper returns codepoint with same size */ 04903 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04904 modify = 1; 04905 } 04906 s += n; 04907 } 04908 } 04909 } 04910 04911 if (modify) return str; 04912 return Qnil; 04913 } 04914 04915 04916 /* 04917 * call-seq: 04918 * str.downcase -> new_str 04919 * 04920 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 04921 * lowercase counterparts. The operation is locale insensitive---only 04922 * characters ``A'' to ``Z'' are affected. 04923 * Note: case replacement is effective only in ASCII region. 04924 * 04925 * "hEllO".downcase #=> "hello" 04926 */ 04927 04928 static VALUE 04929 rb_str_downcase(VALUE str) 04930 { 04931 str = rb_str_dup(str); 04932 rb_str_downcase_bang(str); 04933 return str; 04934 } 04935 04936 04937 /* 04938 * call-seq: 04939 * str.capitalize! 
-> str or nil 04940 * 04941 * Modifies <i>str</i> by converting the first character to uppercase and the 04942 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 04943 * Note: case conversion is effective only in ASCII region. 04944 * 04945 * a = "hello" 04946 * a.capitalize! #=> "Hello" 04947 * a #=> "Hello" 04948 * a.capitalize! #=> nil 04949 */ 04950 04951 static VALUE 04952 rb_str_capitalize_bang(VALUE str) 04953 { 04954 rb_encoding *enc; 04955 char *s, *send; 04956 int modify = 0; 04957 unsigned int c; 04958 int n; 04959 04960 str_modify_keep_cr(str); 04961 enc = STR_ENC_GET(str); 04962 rb_str_check_dummy_enc(enc); 04963 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 04964 s = RSTRING_PTR(str); send = RSTRING_END(str); 04965 04966 c = rb_enc_codepoint_len(s, send, &n, enc); 04967 if (rb_enc_islower(c, enc)) { 04968 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04969 modify = 1; 04970 } 04971 s += n; 04972 while (s < send) { 04973 c = rb_enc_codepoint_len(s, send, &n, enc); 04974 if (rb_enc_isupper(c, enc)) { 04975 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04976 modify = 1; 04977 } 04978 s += n; 04979 } 04980 04981 if (modify) return str; 04982 return Qnil; 04983 } 04984 04985 04986 /* 04987 * call-seq: 04988 * str.capitalize -> new_str 04989 * 04990 * Returns a copy of <i>str</i> with the first character converted to uppercase 04991 * and the remainder to lowercase. 04992 * Note: case conversion is effective only in ASCII region. 04993 * 04994 * "hello".capitalize #=> "Hello" 04995 * "HELLO".capitalize #=> "Hello" 04996 * "123ABC".capitalize #=> "123abc" 04997 */ 04998 04999 static VALUE 05000 rb_str_capitalize(VALUE str) 05001 { 05002 str = rb_str_dup(str); 05003 rb_str_capitalize_bang(str); 05004 return str; 05005 } 05006 05007 05008 /* 05009 * call-seq: 05010 * str.swapcase! 
-> str or nil 05011 * 05012 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 05013 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 05014 * Note: case conversion is effective only in ASCII region. 05015 */ 05016 05017 static VALUE 05018 rb_str_swapcase_bang(VALUE str) 05019 { 05020 rb_encoding *enc; 05021 char *s, *send; 05022 int modify = 0; 05023 int n; 05024 05025 str_modify_keep_cr(str); 05026 enc = STR_ENC_GET(str); 05027 rb_str_check_dummy_enc(enc); 05028 s = RSTRING_PTR(str); send = RSTRING_END(str); 05029 while (s < send) { 05030 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 05031 05032 if (rb_enc_isupper(c, enc)) { 05033 /* assuming toupper returns codepoint with same size */ 05034 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 05035 modify = 1; 05036 } 05037 else if (rb_enc_islower(c, enc)) { 05038 /* assuming tolower returns codepoint with same size */ 05039 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 05040 modify = 1; 05041 } 05042 s += n; 05043 } 05044 05045 if (modify) return str; 05046 return Qnil; 05047 } 05048 05049 05050 /* 05051 * call-seq: 05052 * str.swapcase -> new_str 05053 * 05054 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 05055 * to lowercase and lowercase characters converted to uppercase. 05056 * Note: case conversion is effective only in ASCII region. 
05057 * 05058 * "Hello".swapcase #=> "hELLO" 05059 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 05060 */ 05061 05062 static VALUE 05063 rb_str_swapcase(VALUE str) 05064 { 05065 str = rb_str_dup(str); 05066 rb_str_swapcase_bang(str); 05067 return str; 05068 } 05069 05070 typedef unsigned char *USTR; 05071 05072 struct tr { 05073 int gen; 05074 unsigned int now, max; 05075 char *p, *pend; 05076 }; 05077 05078 static unsigned int 05079 trnext(struct tr *t, rb_encoding *enc) 05080 { 05081 int n; 05082 05083 for (;;) { 05084 if (!t->gen) { 05085 nextpart: 05086 if (t->p == t->pend) return -1; 05087 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) { 05088 t->p += n; 05089 } 05090 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 05091 t->p += n; 05092 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) { 05093 t->p += n; 05094 if (t->p < t->pend) { 05095 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 05096 t->p += n; 05097 if (t->now > c) { 05098 if (t->now < 0x80 && c < 0x80) { 05099 rb_raise(rb_eArgError, 05100 "invalid range \"%c-%c\" in string transliteration", 05101 t->now, c); 05102 } 05103 else { 05104 rb_raise(rb_eArgError, "invalid range in string transliteration"); 05105 } 05106 continue; /* not reached */ 05107 } 05108 t->gen = 1; 05109 t->max = c; 05110 } 05111 } 05112 return t->now; 05113 } 05114 else { 05115 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) { 05116 if (t->now == t->max) { 05117 t->gen = 0; 05118 goto nextpart; 05119 } 05120 } 05121 if (t->now < t->max) { 05122 return t->now; 05123 } 05124 else { 05125 t->gen = 0; 05126 return t->max; 05127 } 05128 } 05129 } 05130 } 05131 05132 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 05133 05134 static VALUE 05135 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 05136 { 05137 const unsigned int errc = -1; 05138 unsigned int trans[256]; 05139 rb_encoding *enc, *e1, *e2; 05140 struct tr trsrc, trrepl; 05141 int cflag = 0; 05142 
unsigned int c, c0, last = 0; 05143 int modify = 0, i, l; 05144 char *s, *send; 05145 VALUE hash = 0; 05146 int singlebyte = single_byte_optimizable(str); 05147 int cr; 05148 05149 #define CHECK_IF_ASCII(c) \ 05150 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 05151 (cr = ENC_CODERANGE_VALID) : 0) 05152 05153 StringValue(src); 05154 StringValue(repl); 05155 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05156 if (RSTRING_LEN(repl) == 0) { 05157 return rb_str_delete_bang(1, &src, str); 05158 } 05159 05160 cr = ENC_CODERANGE(str); 05161 e1 = rb_enc_check(str, src); 05162 e2 = rb_enc_check(str, repl); 05163 if (e1 == e2) { 05164 enc = e1; 05165 } 05166 else { 05167 enc = rb_enc_check(src, repl); 05168 } 05169 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 05170 if (RSTRING_LEN(src) > 1 && 05171 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 05172 trsrc.p + l < trsrc.pend) { 05173 cflag = 1; 05174 trsrc.p += l; 05175 } 05176 trrepl.p = RSTRING_PTR(repl); 05177 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 05178 trsrc.gen = trrepl.gen = 0; 05179 trsrc.now = trrepl.now = 0; 05180 trsrc.max = trrepl.max = 0; 05181 05182 if (cflag) { 05183 for (i=0; i<256; i++) { 05184 trans[i] = 1; 05185 } 05186 while ((c = trnext(&trsrc, enc)) != errc) { 05187 if (c < 256) { 05188 trans[c] = errc; 05189 } 05190 else { 05191 if (!hash) hash = rb_hash_new(); 05192 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 05193 } 05194 } 05195 while ((c = trnext(&trrepl, enc)) != errc) 05196 /* retrieve last replacer */; 05197 last = trrepl.now; 05198 for (i=0; i<256; i++) { 05199 if (trans[i] != errc) { 05200 trans[i] = last; 05201 } 05202 } 05203 } 05204 else { 05205 unsigned int r; 05206 05207 for (i=0; i<256; i++) { 05208 trans[i] = errc; 05209 } 05210 while ((c = trnext(&trsrc, enc)) != errc) { 05211 r = trnext(&trrepl, enc); 05212 if (r == errc) r = trrepl.now; 05213 if (c < 256) { 05214 trans[c] = r; 05215 if (rb_enc_codelen(r, enc) != 1) singlebyte = 
0; 05216 } 05217 else { 05218 if (!hash) hash = rb_hash_new(); 05219 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 05220 } 05221 } 05222 } 05223 05224 if (cr == ENC_CODERANGE_VALID) 05225 cr = ENC_CODERANGE_7BIT; 05226 str_modify_keep_cr(str); 05227 s = RSTRING_PTR(str); send = RSTRING_END(str); 05228 if (sflag) { 05229 int clen, tlen; 05230 long offset, max = RSTRING_LEN(str); 05231 unsigned int save = -1; 05232 char *buf = ALLOC_N(char, max), *t = buf; 05233 05234 while (s < send) { 05235 int may_modify = 0; 05236 05237 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05238 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05239 05240 s += clen; 05241 if (c < 256) { 05242 c = trans[c]; 05243 } 05244 else if (hash) { 05245 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05246 if (NIL_P(tmp)) { 05247 if (cflag) c = last; 05248 else c = errc; 05249 } 05250 else if (cflag) c = errc; 05251 else c = NUM2INT(tmp); 05252 } 05253 else { 05254 c = errc; 05255 } 05256 if (c != (unsigned int)-1) { 05257 if (save == c) { 05258 CHECK_IF_ASCII(c); 05259 continue; 05260 } 05261 save = c; 05262 tlen = rb_enc_codelen(c, enc); 05263 modify = 1; 05264 } 05265 else { 05266 save = -1; 05267 c = c0; 05268 if (enc != e1) may_modify = 1; 05269 } 05270 while (t - buf + tlen >= max) { 05271 offset = t - buf; 05272 max *= 2; 05273 REALLOC_N(buf, char, max); 05274 t = buf + offset; 05275 } 05276 rb_enc_mbcput(c, t, enc); 05277 if (may_modify && memcmp(s, t, tlen) != 0) { 05278 modify = 1; 05279 } 05280 CHECK_IF_ASCII(c); 05281 t += tlen; 05282 } 05283 if (!STR_EMBED_P(str)) { 05284 xfree(RSTRING(str)->as.heap.ptr); 05285 } 05286 *t = '\0'; 05287 RSTRING(str)->as.heap.ptr = buf; 05288 RSTRING(str)->as.heap.len = t - buf; 05289 STR_SET_NOEMBED(str); 05290 RSTRING(str)->as.heap.aux.capa = max; 05291 } 05292 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 05293 while (s < send) { 05294 c = (unsigned char)*s; 05295 if (trans[c] != errc) { 05296 if (!cflag) { 05297 c = trans[c]; 
05298 *s = c; 05299 modify = 1; 05300 } 05301 else { 05302 *s = last; 05303 modify = 1; 05304 } 05305 } 05306 CHECK_IF_ASCII(c); 05307 s++; 05308 } 05309 } 05310 else { 05311 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2); 05312 long offset; 05313 char *buf = ALLOC_N(char, max), *t = buf; 05314 05315 while (s < send) { 05316 int may_modify = 0; 05317 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05318 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05319 05320 if (c < 256) { 05321 c = trans[c]; 05322 } 05323 else if (hash) { 05324 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05325 if (NIL_P(tmp)) { 05326 if (cflag) c = last; 05327 else c = errc; 05328 } 05329 else if (cflag) c = errc; 05330 else c = NUM2INT(tmp); 05331 } 05332 else { 05333 c = cflag ? last : errc; 05334 } 05335 if (c != errc) { 05336 tlen = rb_enc_codelen(c, enc); 05337 modify = 1; 05338 } 05339 else { 05340 c = c0; 05341 if (enc != e1) may_modify = 1; 05342 } 05343 while (t - buf + tlen >= max) { 05344 offset = t - buf; 05345 max *= 2; 05346 REALLOC_N(buf, char, max); 05347 t = buf + offset; 05348 } 05349 if (s != t) { 05350 rb_enc_mbcput(c, t, enc); 05351 if (may_modify && memcmp(s, t, tlen) != 0) { 05352 modify = 1; 05353 } 05354 } 05355 CHECK_IF_ASCII(c); 05356 s += clen; 05357 t += tlen; 05358 } 05359 if (!STR_EMBED_P(str)) { 05360 xfree(RSTRING(str)->as.heap.ptr); 05361 } 05362 *t = '\0'; 05363 RSTRING(str)->as.heap.ptr = buf; 05364 RSTRING(str)->as.heap.len = t - buf; 05365 STR_SET_NOEMBED(str); 05366 RSTRING(str)->as.heap.aux.capa = max; 05367 } 05368 05369 if (modify) { 05370 if (cr != ENC_CODERANGE_BROKEN) 05371 ENC_CODERANGE_SET(str, cr); 05372 rb_enc_associate(str, enc); 05373 return str; 05374 } 05375 return Qnil; 05376 } 05377 05378 05379 /* 05380 * call-seq: 05381 * str.tr!(from_str, to_str) -> str or nil 05382 * 05383 * Translates <i>str</i> in place, using the same rules as 05384 * <code>String#tr</code>. 
Returns <i>str</i>, or <code>nil</code> if no 05385 * changes were made. 05386 */ 05387 05388 static VALUE 05389 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 05390 { 05391 return tr_trans(str, src, repl, 0); 05392 } 05393 05394 05395 /* 05396 * call-seq: 05397 * str.tr(from_str, to_str) => new_str 05398 * 05399 * Returns a copy of +str+ with the characters in +from_str+ replaced by the 05400 * corresponding characters in +to_str+. If +to_str+ is shorter than 05401 * +from_str+, it is padded with its last character in order to maintain the 05402 * correspondence. 05403 * 05404 * "hello".tr('el', 'ip') #=> "hippo" 05405 * "hello".tr('aeiou', '*') #=> "h*ll*" 05406 * "hello".tr('aeiou', 'AA*') #=> "hAll*" 05407 * 05408 * Both strings may use the <code>c1-c2</code> notation to denote ranges of 05409 * characters, and +from_str+ may start with a <code>^</code>, which denotes 05410 * all characters except those listed. 05411 * 05412 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 05413 * "hello".tr('^aeiou', '*') #=> "*e**o" 05414 * 05415 * The backslash character <code></code> can be used to escape 05416 * <code>^</code> or <code>-</code> and is otherwise ignored unless it 05417 * appears at the end of a range or the end of the +from_str+ or +to_str+: 05418 * 05419 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld" 05420 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld" 05421 * 05422 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld" 05423 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold" 05424 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld" 05425 * 05426 * "X['\\b']".tr("X\\", "") #=> "['b']" 05427 * "X['\\b']".tr("X-\\]", "") #=> "'b'" 05428 */ 05429 05430 static VALUE 05431 rb_str_tr(VALUE str, VALUE src, VALUE repl) 05432 { 05433 str = rb_str_dup(str); 05434 tr_trans(str, src, repl, 0); 05435 return str; 05436 } 05437 05438 #define TR_TABLE_SIZE 257 05439 static void 05440 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 05441 VALUE *tablep, 
VALUE *ctablep, rb_encoding *enc) 05442 { 05443 const unsigned int errc = -1; 05444 char buf[256]; 05445 struct tr tr; 05446 unsigned int c; 05447 VALUE table = 0, ptable = 0; 05448 int i, l, cflag = 0; 05449 05450 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 05451 tr.gen = tr.now = tr.max = 0; 05452 05453 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 05454 cflag = 1; 05455 tr.p += l; 05456 } 05457 if (first) { 05458 for (i=0; i<256; i++) { 05459 stable[i] = 1; 05460 } 05461 stable[256] = cflag; 05462 } 05463 else if (stable[256] && !cflag) { 05464 stable[256] = 0; 05465 } 05466 for (i=0; i<256; i++) { 05467 buf[i] = cflag; 05468 } 05469 05470 while ((c = trnext(&tr, enc)) != errc) { 05471 if (c < 256) { 05472 buf[c & 0xff] = !cflag; 05473 } 05474 else { 05475 VALUE key = UINT2NUM(c); 05476 05477 if (!table && (first || *tablep || stable[256])) { 05478 if (cflag) { 05479 ptable = *ctablep; 05480 table = ptable ? ptable : rb_hash_new(); 05481 *ctablep = table; 05482 } 05483 else { 05484 table = rb_hash_new(); 05485 ptable = *tablep; 05486 *tablep = table; 05487 } 05488 } 05489 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) { 05490 rb_hash_aset(table, key, Qtrue); 05491 } 05492 } 05493 } 05494 for (i=0; i<256; i++) { 05495 stable[i] = stable[i] && buf[i]; 05496 } 05497 if (!table && !cflag) { 05498 *tablep = 0; 05499 } 05500 } 05501 05502 05503 static int 05504 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 05505 { 05506 if (c < 256) { 05507 return table[c] != 0; 05508 } 05509 else { 05510 VALUE v = UINT2NUM(c); 05511 05512 if (del) { 05513 if (!NIL_P(rb_hash_lookup(del, v)) && 05514 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 05515 return TRUE; 05516 } 05517 } 05518 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 05519 return FALSE; 05520 } 05521 return table[256] ? 
TRUE : FALSE; 05522 } 05523 } 05524 05525 /* 05526 * call-seq: 05527 * str.delete!([other_str]+) -> str or nil 05528 * 05529 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 05530 * <code>nil</code> if <i>str</i> was not modified. 05531 */ 05532 05533 static VALUE 05534 rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 05535 { 05536 char squeez[TR_TABLE_SIZE]; 05537 rb_encoding *enc = 0; 05538 char *s, *send, *t; 05539 VALUE del = 0, nodel = 0; 05540 int modify = 0; 05541 int i, ascompat, cr; 05542 05543 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05544 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 05545 for (i=0; i<argc; i++) { 05546 VALUE s = argv[i]; 05547 05548 StringValue(s); 05549 enc = rb_enc_check(str, s); 05550 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05551 } 05552 05553 str_modify_keep_cr(str); 05554 ascompat = rb_enc_asciicompat(enc); 05555 s = t = RSTRING_PTR(str); 05556 send = RSTRING_END(str); 05557 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 05558 while (s < send) { 05559 unsigned int c; 05560 int clen; 05561 05562 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05563 if (squeez[c]) { 05564 modify = 1; 05565 } 05566 else { 05567 if (t != s) *t = c; 05568 t++; 05569 } 05570 s++; 05571 } 05572 else { 05573 c = rb_enc_codepoint_len(s, send, &clen, enc); 05574 05575 if (tr_find(c, squeez, del, nodel)) { 05576 modify = 1; 05577 } 05578 else { 05579 if (t != s) rb_enc_mbcput(c, t, enc); 05580 t += clen; 05581 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 05582 } 05583 s += clen; 05584 } 05585 } 05586 *t = '\0'; 05587 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05588 ENC_CODERANGE_SET(str, cr); 05589 05590 if (modify) return str; 05591 return Qnil; 05592 } 05593 05594 05595 /* 05596 * call-seq: 05597 * str.delete([other_str]+) -> new_str 05598 * 05599 * Returns a copy of <i>str</i> with all characters in the intersection of its 05600 * arguments deleted. 
Uses the same rules for building the set of characters as 05601 * <code>String#count</code>. 05602 * 05603 * "hello".delete "l","lo" #=> "heo" 05604 * "hello".delete "lo" #=> "he" 05605 * "hello".delete "aeiou", "^e" #=> "hell" 05606 * "hello".delete "ej-m" #=> "ho" 05607 */ 05608 05609 static VALUE 05610 rb_str_delete(int argc, VALUE *argv, VALUE str) 05611 { 05612 str = rb_str_dup(str); 05613 rb_str_delete_bang(argc, argv, str); 05614 return str; 05615 } 05616 05617 05618 /* 05619 * call-seq: 05620 * str.squeeze!([other_str]*) -> str or nil 05621 * 05622 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 05623 * <code>nil</code> if no changes were made. 05624 */ 05625 05626 static VALUE 05627 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 05628 { 05629 char squeez[TR_TABLE_SIZE]; 05630 rb_encoding *enc = 0; 05631 VALUE del = 0, nodel = 0; 05632 char *s, *send, *t; 05633 int i, modify = 0; 05634 int ascompat, singlebyte = single_byte_optimizable(str); 05635 unsigned int save; 05636 05637 if (argc == 0) { 05638 enc = STR_ENC_GET(str); 05639 } 05640 else { 05641 for (i=0; i<argc; i++) { 05642 VALUE s = argv[i]; 05643 05644 StringValue(s); 05645 enc = rb_enc_check(str, s); 05646 if (singlebyte && !single_byte_optimizable(s)) 05647 singlebyte = 0; 05648 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05649 } 05650 } 05651 05652 str_modify_keep_cr(str); 05653 s = t = RSTRING_PTR(str); 05654 if (!s || RSTRING_LEN(str) == 0) return Qnil; 05655 send = RSTRING_END(str); 05656 save = -1; 05657 ascompat = rb_enc_asciicompat(enc); 05658 05659 if (singlebyte) { 05660 while (s < send) { 05661 unsigned int c = *(unsigned char*)s++; 05662 if (c != save || (argc > 0 && !squeez[c])) { 05663 *t++ = save = c; 05664 } 05665 } 05666 } else { 05667 while (s < send) { 05668 unsigned int c; 05669 int clen; 05670 05671 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05672 if (c != save || (argc > 0 && !squeez[c])) { 05673 *t++ = save = c; 05674 } 05675 s++; 05676 
} 05677 else { 05678 c = rb_enc_codepoint_len(s, send, &clen, enc); 05679 05680 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 05681 if (t != s) rb_enc_mbcput(c, t, enc); 05682 save = c; 05683 t += clen; 05684 } 05685 s += clen; 05686 } 05687 } 05688 } 05689 05690 *t = '\0'; 05691 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 05692 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05693 modify = 1; 05694 } 05695 05696 if (modify) return str; 05697 return Qnil; 05698 } 05699 05700 05701 /* 05702 * call-seq: 05703 * str.squeeze([other_str]*) -> new_str 05704 * 05705 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 05706 * procedure described for <code>String#count</code>. Returns a new string 05707 * where runs of the same character that occur in this set are replaced by a 05708 * single character. If no arguments are given, all runs of identical 05709 * characters are replaced by a single character. 05710 * 05711 * "yellow moon".squeeze #=> "yelow mon" 05712 * " now is the".squeeze(" ") #=> " now is the" 05713 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 05714 */ 05715 05716 static VALUE 05717 rb_str_squeeze(int argc, VALUE *argv, VALUE str) 05718 { 05719 str = rb_str_dup(str); 05720 rb_str_squeeze_bang(argc, argv, str); 05721 return str; 05722 } 05723 05724 05725 /* 05726 * call-seq: 05727 * str.tr_s!(from_str, to_str) -> str or nil 05728 * 05729 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 05730 * returning <i>str</i>, or <code>nil</code> if no changes were made. 05731 */ 05732 05733 static VALUE 05734 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 05735 { 05736 return tr_trans(str, src, repl, 1); 05737 } 05738 05739 05740 /* 05741 * call-seq: 05742 * str.tr_s(from_str, to_str) -> new_str 05743 * 05744 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 05745 * then removes duplicate characters in regions that were affected by the 05746 * translation. 
05747 * 05748 * "hello".tr_s('l', 'r') #=> "hero" 05749 * "hello".tr_s('el', '*') #=> "h*o" 05750 * "hello".tr_s('el', 'hx') #=> "hhxo" 05751 */ 05752 05753 static VALUE 05754 rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 05755 { 05756 str = rb_str_dup(str); 05757 tr_trans(str, src, repl, 1); 05758 return str; 05759 } 05760 05761 05762 /* 05763 * call-seq: 05764 * str.count([other_str]+) -> fixnum 05765 * 05766 * Each +other_str+ parameter defines a set of characters to count. The 05767 * intersection of these sets defines the characters to count in +str+. Any 05768 * +other_str+ that starts with a caret <code>^</code> is negated. The 05769 * sequence <code>c1-c2</code> means all characters between c1 and c2. The 05770 * backslash character <code></code> can be used to escape <code>^</code> or 05771 * <code>-</code> and is otherwise ignored unless it appears at the end of a 05772 * sequence or the end of a +other_str+. 05773 * 05774 * a = "hello world" 05775 * a.count "lo" #=> 5 05776 * a.count "lo", "o" #=> 2 05777 * a.count "hello", "^l" #=> 4 05778 * a.count "ej-m" #=> 4 05779 * 05780 * "hello^world".count "\\^aeiou" #=> 4 05781 * "hello-world".count "a\\-eo" #=> 4 05782 * 05783 * c = "hello world\\r\\n" 05784 * c.count "\\" #=> 2 05785 * c.count "\\A" #=> 0 05786 * c.count "X-\\w" #=> 3 05787 */ 05788 05789 static VALUE 05790 rb_str_count(int argc, VALUE *argv, VALUE str) 05791 { 05792 char table[TR_TABLE_SIZE]; 05793 rb_encoding *enc = 0; 05794 VALUE del = 0, nodel = 0; 05795 char *s, *send; 05796 int i; 05797 int ascompat; 05798 05799 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); 05800 for (i=0; i<argc; i++) { 05801 VALUE tstr = argv[i]; 05802 unsigned char c; 05803 05804 StringValue(tstr); 05805 enc = rb_enc_check(str, tstr); 05806 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 05807 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 05808 int n = 0; 05809 05810 s = RSTRING_PTR(str); 05811 if (!s || RSTRING_LEN(str) == 0) 
return INT2FIX(0); 05812 send = RSTRING_END(str); 05813 while (s < send) { 05814 if (*(unsigned char*)s++ == c) n++; 05815 } 05816 return INT2NUM(n); 05817 } 05818 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 05819 } 05820 05821 s = RSTRING_PTR(str); 05822 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05823 send = RSTRING_END(str); 05824 ascompat = rb_enc_asciicompat(enc); 05825 i = 0; 05826 while (s < send) { 05827 unsigned int c; 05828 05829 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05830 if (table[c]) { 05831 i++; 05832 } 05833 s++; 05834 } 05835 else { 05836 int clen; 05837 c = rb_enc_codepoint_len(s, send, &clen, enc); 05838 if (tr_find(c, table, del, nodel)) { 05839 i++; 05840 } 05841 s += clen; 05842 } 05843 } 05844 05845 return INT2NUM(i); 05846 } 05847 05848 static const char isspacetable[256] = { 05849 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 05850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05851 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05854 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05856 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05857 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05858 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05859 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05860 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05861 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05862 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05864 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 05865 }; 05866 05867 #define ascii_isspace(c) isspacetable[(unsigned char)(c)] 05868 05869 /* 05870 * call-seq: 05871 * str.split(pattern=$;, [limit]) -> anArray 05872 * 05873 * Divides <i>str</i> into substrings based on a delimiter, returning an array 05874 * of these substrings. 
05875 * 05876 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 05877 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single 05878 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 05879 * of contiguous whitespace characters ignored. 05880 * 05881 * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the 05882 * pattern matches. Whenever the pattern matches a zero-length string, 05883 * <i>str</i> is split into individual characters. If <i>pattern</i> contains 05884 * groups, the respective matches will be returned in the array as well. 05885 * 05886 * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If 05887 * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is 05888 * split on whitespace as if ` ' were specified. 05889 * 05890 * If the <i>limit</i> parameter is omitted, trailing null fields are 05891 * suppressed. If <i>limit</i> is a positive number, at most that number of 05892 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire 05893 * string is returned as the only entry in an array). If negative, there is no 05894 * limit to the number of fields returned, and trailing null fields are not 05895 * suppressed. 05896 * 05897 * When the input +str+ is empty an empty Array is returned as the string is 05898 * considered to have no fields to split. 
 *
 *     " now's  the time".split        #=> ["now's", "the", "time"]
 *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
 *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
 *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
 *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
 *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
 *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
 *
 *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
 *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
 *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
 *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
 *
 *     "".split(',', -1)               #=> []
 */

static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    /*
     * Splits str into a new Array of substrings.  argv holds the optional
     * pattern and the optional limit.  Three strategies are used:
     *   awk    - split on runs of whitespace, ignoring leading whitespace
     *   string - split on a literal separator string
     *   regexp - split wherever the pattern matches
     */
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    long beg, end, i = 0;       /* i counts the fields produced so far when a limit is active */
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* A non-positive limit means "no cap"; lim itself is kept because a
         * negative value later disables trailing-empty-field removal. */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* limit 1: the whole string is the only field (none if empty) */
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
        /* no pattern given: fall back to rb_fs when it is set, else awk mode */
        if (!NIL_P(rb_fs)) {
            spat = rb_fs;
            goto fs_set;
        }
        split_type = awk;
    }
    else {
      fs_set:
        if (RB_TYPE_P(spat, T_STRING)) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Special case - split into chars */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* a separator that is exactly one space selects awk mode */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
                    split_type = awk;
                }
            }
            else {
                /* same single-space test for separators whose encoding is
                 * not ASCII compatible */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
        else {
            spat = get_pat(spat, 1);
            split_type = regexp;
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        int skip = 1;           /* non-zero while scanning whitespace between fields */
        unsigned int c;

        /* beg/end are byte offsets of the field currently being scanned */
        end = beg;
        if (is_ascii_string(str)) {
            /* byte-at-a-time scan */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* limit reached: stop at the start of the last field;
                         * the remainder is pushed after the branches below */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* same state machine, advancing one encoded character at a time */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);
        char *temp = ptr;       /* start of the buffer, used to turn ptr into an offset */
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        if (is_broken_string(str)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
        }
        if (is_broken_string(spat)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
        }
        enc = rb_enc_check(str, spat);
        /* end is the offset of the next separator relative to ptr */
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* hit lies inside a multibyte character: resume after it */
                ptr = t;
                continue;
            }
            rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
            ptr += end + slen;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - temp;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;       /* where the next regexp search begins */
        long idx;
        int last_null = 0;      /* set after an empty match made us step one char ahead */
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* empty match exactly at the search position */
                if (!ptr) {
                    rb_ary_push(result, str_new_empty(str));
                    break;
                }
                else if (last_null == 1) {
                    /* second empty match in a row: emit one character */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_fast_mbclen(ptr+beg,
                                                                         ptr+len,
                                                                         enc)));
                    beg = start;
                }
                else {
                    /* step over one character (one byte at the very end)
                     * and search again */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* append the capture groups of this match; groups that did not
             * participate (BEG == -1) are skipped */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = str_new_empty(str);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /* push whatever follows the last separator (possibly an empty field) */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = str_new_empty(str);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* limit omitted or zero: drop trailing empty fields */
    if (NIL_P(limit) && lim == 0) {
        long len;
        while ((len = RARRAY_LEN(result)) > 0 &&
               (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
            rb_ary_pop(result);
    }

    return result;
}

VALUE
rb_str_split(VALUE str, const char *sep0)
{
    /* C-level helper: splits str on the NUL-terminated separator sep0 by
     * calling rb_str_split_m with a single argument (no limit). */
    VALUE sep;

    StringValue(str);
    sep = rb_str_new2(sep0);
    return rb_str_split_m(1, &sep, str);
}


static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
{
    /*
     * Shared worker for the line enumeration methods.  Cuts str into records
     * ending with the separator (argv[0], or rb_rs when argc is 0).  With
     * wantarray non-zero the records are collected into an Array, otherwise
     * each record is yielded and the receiver is returned.
     */
    rb_encoding *enc;
    VALUE rs;
    unsigned int newline;
    const char *p, *pend, *s, *ptr;     /* p: scan position, s: start of current record */
    long len, rslen;
    VALUE line;
    int n;
    VALUE orig = str;
    VALUE UNINITIALIZED_VAR(ary);

    if (argc == 0) {
        rs = rb_rs;
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }

    if (rb_block_given_p()) {
        if (wantarray) {
#if 0 /* next major */
            rb_warn("given block not used");
            ary = rb_ary_new();
#else
            /* array form called with a block: warn and behave like the
             * yielding form */
            rb_warning("passing a block to String#lines is deprecated");
            wantarray = 0;
#endif
        }
    }
    else {
        if (wantarray)
            ary = rb_ary_new();
        else
            RETURN_ENUMERATOR(str, argc, argv);
    }

    if (NIL_P(rs)) {
        /* nil separator: the whole string is the single record */
        if (wantarray) {
            rb_ary_push(ary, str);
            return ary;
        }
        else {
            rb_yield(str);
            return orig;
        }
    }
    /* iterate over the object returned by rb_str_new4 rather than the receiver */
    str = rb_str_new4(str);
    ptr = p = s = RSTRING_PTR(str);
    pend = p + RSTRING_LEN(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    if (rs == rb_default_rs) {
        /* default separator: find '\n' bytes with memchr, then confirm each
         * hit is a real newline character on a character boundary */
        enc = rb_enc_get(str);
        while (p < pend) {
            char *p0;

            p = memchr(p, '\n', pend - p);
            if (!p) break;
            p0 = rb_enc_left_char_head(s, p, pend, enc);
            if (!rb_enc_is_newline(p0, pend, enc)) {
                p++;
                continue;
            }
            p = p0 + rb_enc_mbclen(p0, pend, enc);
            line = rb_str_subseq(str, s - ptr, p - s);
            if (wantarray)
                rb_ary_push(ary, line);
            else
                rb_yield(line);
            str_mod_check(str, ptr, len);
            s = p;
        }
        goto finish;
    }

    enc = rb_enc_check(str, rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* empty separator: paragraph mode, records end at runs of newlines */
        newline = '\n';
    }
    else {
        /* only the first character of rs is compared per position; the full
         * separator is verified with memcmp below */
        newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
    }

    while (p < pend) {
        unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);

      again:
        if (rslen == 0 && c == newline) {
            /* paragraph mode: a newline followed by a non-newline character
             * is not a separator (re-examined via "again"); otherwise move to
             * the last newline of the run */
            p += n;
            if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
                goto again;
            }
            while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
                p += n;
            }
            p -= n;
        }
        if (c == newline &&
            (rslen <= 1 ||
             (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
            /* pp: first byte after the separator, i.e. end of this record */
            const char *pp = p + (rslen ? rslen : n);
            line = rb_str_subseq(str, s - ptr, pp - s);
            if (wantarray)
                rb_ary_push(ary, line);
            else
                rb_yield(line);
            str_mod_check(str, ptr, len);
            s = pp;
        }
        p += n;
    }

  finish:
    /* trailing text without a terminating separator forms the last record */
    if (s != pend) {
        line = rb_str_subseq(str, s - ptr, pend - s);
        if (wantarray)
            rb_ary_push(ary, line);
        else
            rb_yield(line);
        RB_GC_GUARD(str);
    }

    if (wantarray)
        return ary;
    else
        return orig;
}

/*
 *  call-seq:
 *     str.each_line(separator=$/) {|substr| block }   -> str
 *     str.each_line(separator=$/)                     -> an_enumerator
 *
 *  Splits <i>str</i> using the supplied parameter as the record
 *  separator (<code>$/</code> by default), passing each substring in
 *  turn to the supplied block.
 *  If a zero-length record separator is
 *  supplied, the string is split into paragraphs delimited by
 *  multiple successive newlines.
 *
 *  If no block is given, an enumerator is returned instead.
 *
 *     print "Example one\n"
 *     "hello\nworld".each_line {|s| p s}
 *     print "Example two\n"
 *     "hello\nworld".each_line('l') {|s| p s}
 *     print "Example three\n"
 *     "hello\n\n\nworld".each_line('') {|s| p s}
 *
 *  <em>produces:</em>
 *
 *     Example one
 *     "hello\n"
 *     "world"
 *     Example two
 *     "hel"
 *     "l"
 *     "o\nworl"
 *     "d"
 *     Example three
 *     "hello\n\n\n"
 *     "world"
 */

static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* wantarray = 0: yield each line (or return an enumerator when no
     * block is given) instead of building an Array */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}

/*
 *  call-seq:
 *     str.lines(separator=$/)  -> an_array
 *
 *  Returns an array of lines in <i>str</i> split using the supplied
 *  record separator (<code>$/</code> by default).  This is a
 *  shorthand for <code>str.each_line(separator).to_a</code>.
 *
 *  If a block is given, which is a deprecated form, works the same as
 *  <code>each_line</code>.
06324 */ 06325 06326 static VALUE 06327 rb_str_lines(int argc, VALUE *argv, VALUE str) 06328 { 06329 return rb_str_enumerate_lines(argc, argv, str, 1); 06330 } 06331 06332 static VALUE 06333 rb_str_each_byte_size(VALUE str, VALUE args) 06334 { 06335 return LONG2FIX(RSTRING_LEN(str)); 06336 } 06337 06338 static VALUE 06339 rb_str_enumerate_bytes(VALUE str, int wantarray) 06340 { 06341 long i; 06342 VALUE UNINITIALIZED_VAR(ary); 06343 06344 if (rb_block_given_p()) { 06345 if (wantarray) { 06346 #if 0 /* next major */ 06347 rb_warn("given block not used"); 06348 ary = rb_ary_new(); 06349 #else 06350 rb_warning("passing a block to String#bytes is deprecated"); 06351 wantarray = 0; 06352 #endif 06353 } 06354 } 06355 else { 06356 if (wantarray) 06357 ary = rb_ary_new2(RSTRING_LEN(str)); 06358 else 06359 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size); 06360 } 06361 06362 for (i=0; i<RSTRING_LEN(str); i++) { 06363 if (wantarray) 06364 rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06365 else 06366 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06367 } 06368 if (wantarray) 06369 return ary; 06370 else 06371 return str; 06372 } 06373 06374 /* 06375 * call-seq: 06376 * str.each_byte {|fixnum| block } -> str 06377 * str.each_byte -> an_enumerator 06378 * 06379 * Passes each byte in <i>str</i> to the given block, or returns an 06380 * enumerator if no block is given. 06381 * 06382 * "hello".each_byte {|c| print c, ' ' } 06383 * 06384 * <em>produces:</em> 06385 * 06386 * 104 101 108 108 111 06387 */ 06388 06389 static VALUE 06390 rb_str_each_byte(VALUE str) 06391 { 06392 return rb_str_enumerate_bytes(str, 0); 06393 } 06394 06395 /* 06396 * call-seq: 06397 * str.bytes -> an_array 06398 * 06399 * Returns an array of bytes in <i>str</i>. This is a shorthand for 06400 * <code>str.each_byte.to_a</code>. 06401 * 06402 * If a block is given, which is a deprecated form, works the same as 06403 * <code>each_byte</code>. 
06404 */ 06405 06406 static VALUE 06407 rb_str_bytes(VALUE str) 06408 { 06409 return rb_str_enumerate_bytes(str, 1); 06410 } 06411 06412 static VALUE 06413 rb_str_each_char_size(VALUE str) 06414 { 06415 long len = RSTRING_LEN(str); 06416 if (!single_byte_optimizable(str)) { 06417 const char *ptr = RSTRING_PTR(str); 06418 rb_encoding *enc = rb_enc_get(str); 06419 const char *end_ptr = ptr + len; 06420 for (len = 0; ptr < end_ptr; ++len) { 06421 ptr += rb_enc_mbclen(ptr, end_ptr, enc); 06422 } 06423 } 06424 return LONG2FIX(len); 06425 } 06426 06427 static VALUE 06428 rb_str_enumerate_chars(VALUE str, int wantarray) 06429 { 06430 VALUE orig = str; 06431 VALUE substr; 06432 long i, len, n; 06433 const char *ptr; 06434 rb_encoding *enc; 06435 VALUE UNINITIALIZED_VAR(ary); 06436 06437 if (rb_block_given_p()) { 06438 if (wantarray) { 06439 #if 0 /* next major */ 06440 rb_warn("given block not used"); 06441 ary = rb_ary_new(); 06442 #else 06443 rb_warning("passing a block to String#chars is deprecated"); 06444 wantarray = 0; 06445 #endif 06446 } 06447 } 06448 else { 06449 if (wantarray) 06450 ary = rb_ary_new(); 06451 else 06452 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); 06453 } 06454 06455 str = rb_str_new4(str); 06456 ptr = RSTRING_PTR(str); 06457 len = RSTRING_LEN(str); 06458 enc = rb_enc_get(str); 06459 switch (ENC_CODERANGE(str)) { 06460 case ENC_CODERANGE_VALID: 06461 case ENC_CODERANGE_7BIT: 06462 for (i = 0; i < len; i += n) { 06463 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 06464 substr = rb_str_subseq(str, i, n); 06465 if (wantarray) 06466 rb_ary_push(ary, substr); 06467 else 06468 rb_yield(substr); 06469 } 06470 break; 06471 default: 06472 for (i = 0; i < len; i += n) { 06473 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 06474 substr = rb_str_subseq(str, i, n); 06475 if (wantarray) 06476 rb_ary_push(ary, substr); 06477 else 06478 rb_yield(substr); 06479 } 06480 } 06481 RB_GC_GUARD(str); 06482 if (wantarray) 06483 return ary; 06484 else 06485 
return orig; 06486 } 06487 06488 /* 06489 * call-seq: 06490 * str.each_char {|cstr| block } -> str 06491 * str.each_char -> an_enumerator 06492 * 06493 * Passes each character in <i>str</i> to the given block, or returns 06494 * an enumerator if no block is given. 06495 * 06496 * "hello".each_char {|c| print c, ' ' } 06497 * 06498 * <em>produces:</em> 06499 * 06500 * h e l l o 06501 */ 06502 06503 static VALUE 06504 rb_str_each_char(VALUE str) 06505 { 06506 return rb_str_enumerate_chars(str, 0); 06507 } 06508 06509 /* 06510 * call-seq: 06511 * str.chars -> an_array 06512 * 06513 * Returns an array of characters in <i>str</i>. This is a shorthand 06514 * for <code>str.each_char.to_a</code>. 06515 * 06516 * If a block is given, which is a deprecated form, works the same as 06517 * <code>each_char</code>. 06518 */ 06519 06520 static VALUE 06521 rb_str_chars(VALUE str) 06522 { 06523 return rb_str_enumerate_chars(str, 1); 06524 } 06525 06526 06527 static VALUE 06528 rb_str_enumerate_codepoints(VALUE str, int wantarray) 06529 { 06530 VALUE orig = str; 06531 int n; 06532 unsigned int c; 06533 const char *ptr, *end; 06534 rb_encoding *enc; 06535 VALUE UNINITIALIZED_VAR(ary); 06536 06537 if (single_byte_optimizable(str)) 06538 return rb_str_enumerate_bytes(str, wantarray); 06539 06540 if (rb_block_given_p()) { 06541 if (wantarray) { 06542 #if 0 /* next major */ 06543 rb_warn("given block not used"); 06544 ary = rb_ary_new(); 06545 #else 06546 rb_warning("passing a block to String#codepoints is deprecated"); 06547 wantarray = 0; 06548 #endif 06549 } 06550 } 06551 else { 06552 if (wantarray) 06553 ary = rb_ary_new(); 06554 else 06555 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); 06556 } 06557 06558 str = rb_str_new4(str); 06559 ptr = RSTRING_PTR(str); 06560 end = RSTRING_END(str); 06561 enc = STR_ENC_GET(str); 06562 while (ptr < end) { 06563 c = rb_enc_codepoint_len(ptr, end, &n, enc); 06564 if (wantarray) 06565 rb_ary_push(ary, UINT2NUM(c)); 06566 else 06567 
rb_yield(UINT2NUM(c)); 06568 ptr += n; 06569 } 06570 RB_GC_GUARD(str); 06571 if (wantarray) 06572 return ary; 06573 else 06574 return orig; 06575 } 06576 06577 /* 06578 * call-seq: 06579 * str.each_codepoint {|integer| block } -> str 06580 * str.each_codepoint -> an_enumerator 06581 * 06582 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 06583 * also known as a <i>codepoint</i> when applied to Unicode strings to the 06584 * given block. 06585 * 06586 * If no block is given, an enumerator is returned instead. 06587 * 06588 * "hello\u0639".each_codepoint {|c| print c, ' ' } 06589 * 06590 * <em>produces:</em> 06591 * 06592 * 104 101 108 108 111 1593 06593 */ 06594 06595 static VALUE 06596 rb_str_each_codepoint(VALUE str) 06597 { 06598 return rb_str_enumerate_codepoints(str, 0); 06599 } 06600 06601 /* 06602 * call-seq: 06603 * str.codepoints -> an_array 06604 * 06605 * Returns an array of the <code>Integer</code> ordinals of the 06606 * characters in <i>str</i>. This is a shorthand for 06607 * <code>str.each_codepoint.to_a</code>. 06608 * 06609 * If a block is given, which is a deprecated form, works the same as 06610 * <code>each_codepoint</code>. 06611 */ 06612 06613 static VALUE 06614 rb_str_codepoints(VALUE str) 06615 { 06616 return rb_str_enumerate_codepoints(str, 1); 06617 } 06618 06619 06620 static long 06621 chopped_length(VALUE str) 06622 { 06623 rb_encoding *enc = STR_ENC_GET(str); 06624 const char *p, *p2, *beg, *end; 06625 06626 beg = RSTRING_PTR(str); 06627 end = beg + RSTRING_LEN(str); 06628 if (beg > end) return 0; 06629 p = rb_enc_prev_char(beg, end, end, enc); 06630 if (!p) return 0; 06631 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 06632 p2 = rb_enc_prev_char(beg, p, end, enc); 06633 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 06634 } 06635 return p - beg; 06636 } 06637 06638 /* 06639 * call-seq: 06640 * str.chop! 
-> str or nil 06641 * 06642 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 06643 * or <code>nil</code> if <i>str</i> is the empty string. See also 06644 * <code>String#chomp!</code>. 06645 */ 06646 06647 static VALUE 06648 rb_str_chop_bang(VALUE str) 06649 { 06650 str_modify_keep_cr(str); 06651 if (RSTRING_LEN(str) > 0) { 06652 long len; 06653 len = chopped_length(str); 06654 STR_SET_LEN(str, len); 06655 RSTRING_PTR(str)[len] = '\0'; 06656 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06657 ENC_CODERANGE_CLEAR(str); 06658 } 06659 return str; 06660 } 06661 return Qnil; 06662 } 06663 06664 06665 /* 06666 * call-seq: 06667 * str.chop -> new_str 06668 * 06669 * Returns a new <code>String</code> with the last character removed. If the 06670 * string ends with <code>\r\n</code>, both characters are removed. Applying 06671 * <code>chop</code> to an empty string returns an empty 06672 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 06673 * the string unchanged if it doesn't end in a record separator. 06674 * 06675 * "string\r\n".chop #=> "string" 06676 * "string\n\r".chop #=> "string\n" 06677 * "string\n".chop #=> "string" 06678 * "string".chop #=> "strin" 06679 * "x".chop.chop #=> "" 06680 */ 06681 06682 static VALUE 06683 rb_str_chop(VALUE str) 06684 { 06685 return rb_str_subseq(str, 0, chopped_length(str)); 06686 } 06687 06688 06689 /* 06690 * call-seq: 06691 * str.chomp!(separator=$/) -> str or nil 06692 * 06693 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 06694 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
 */

static VALUE
rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Removes a trailing record separator from str in place.  Returns str
     * when something was removed and Qnil otherwise.  The separator is
     * argv[0], or rb_rs when argc is 0.
     */
    rb_encoding *enc;
    VALUE rs;
    int newline;
    char *p, *pp, *e;   /* p: start of buffer, e: end of buffer, pp: candidate cut point */
    long len, rslen;

    str_modify_keep_cr(str);
    len = RSTRING_LEN(str);
    if (len == 0) return Qnil;
    p = RSTRING_PTR(str);
    e = p + len;
    if (argc == 0) {
        rs = rb_rs;
        if (rs == rb_default_rs) {
            /* Default separator (also entered by goto when the separator is
             * exactly "\n"): strip a trailing "\n", "\r\n" or "\r". */
          smart_chomp:
            enc = rb_enc_get(str);
            if (rb_enc_mbminlen(enc) > 1) {
                /* encodings whose minimum character width exceeds one byte:
                 * locate the last character, then the one before it */
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (rb_enc_is_newline(pp, e, enc)) {
                    e = pp;
                }
                pp = e - rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
                /* e did not move: nothing to remove */
                if (e == RSTRING_END(str)) {
                    return Qnil;
                }
                len = e - RSTRING_PTR(str);
                STR_SET_LEN(str, len);
            }
            else {
                /* byte-wise check of the last one or two bytes */
                if (RSTRING_PTR(str)[len-1] == '\n') {
                    STR_DEC_LEN(str);
                    if (RSTRING_LEN(str) > 0 &&
                        RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
                        STR_DEC_LEN(str);
                    }
                }
                else if (RSTRING_PTR(str)[len-1] == '\r') {
                    STR_DEC_LEN(str);
                }
                else {
                    return Qnil;
                }
            }
            RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
            return str;
        }
    }
    else {
        rb_scan_args(argc, argv, "01", &rs);
    }
    if (NIL_P(rs)) return Qnil;
    StringValue(rs);
    rslen = RSTRING_LEN(rs);
    if (rslen == 0) {
        /* empty separator: strip every trailing "\n" / "\r\n", comparing
         * raw bytes */
        while (len>0 && p[len-1] == '\n') {
            len--;
            if (len>0 && p[len-1] == '\r')
                len--;
        }
        if (len < RSTRING_LEN(str)) {
            STR_SET_LEN(str, len);
            RSTRING_PTR(str)[len] = '\0';
            return str;
        }
        return Qnil;
    }
    if (rslen > len) return Qnil;
    /* last byte of the separator, used as a cheap pre-check before memcmp */
    newline = RSTRING_PTR(rs)[rslen-1];
    if (rslen == 1 && newline == '\n')
        goto smart_chomp;

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return Qnil;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
        /* the matching tail must begin on a character boundary */
        if (rb_enc_left_char_head(p, pp, e, enc) != pp)
            return Qnil;
        if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
            ENC_CODERANGE_CLEAR(str);
        }
        STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
        RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
        return str;
    }
    return Qnil;
}


/*
 *  call-seq:
 *     str.chomp(separator=$/)   -> new_str
 *
 *  Returns a new <code>String</code> with the given record separator removed
 *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
 *  changed from the default Ruby record separator, then <code>chomp</code> also
 *  removes carriage return characters (that is it will remove <code>\n</code>,
 *  <code>\r</code>, and <code>\r\n</code>).
 *
 *     "hello".chomp            #=> "hello"
 *     "hello\n".chomp          #=> "hello"
 *     "hello\r\n".chomp        #=> "hello"
 *     "hello\n\r".chomp        #=> "hello\n"
 *     "hello\r".chomp          #=> "hello"
 *     "hello \n there".chomp   #=> "hello \n there"
 *     "hello".chomp("llo")     #=> "he"
 */

static VALUE
rb_str_chomp(int argc, VALUE *argv, VALUE str)
{
    /* work on a duplicate; the copy is returned whether or not chomp!
     * changed it (its Qnil result is ignored) */
    str = rb_str_dup(str);
    rb_str_chomp_bang(argc, argv, str);
    return str;
}

/*
 *  call-seq:
 *     str.lstrip!   -> self or nil
 *
 *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
 *  change was made. See also <code>String#rstrip!</code> and
 *  <code>String#strip!</code>.
 *
 *     " hello ".lstrip   #=> "hello "
 *     "hello".lstrip!
#=> nil 06835 */ 06836 06837 static VALUE 06838 rb_str_lstrip_bang(VALUE str) 06839 { 06840 rb_encoding *enc; 06841 char *s, *t, *e; 06842 06843 str_modify_keep_cr(str); 06844 enc = STR_ENC_GET(str); 06845 s = RSTRING_PTR(str); 06846 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06847 e = t = RSTRING_END(str); 06848 /* remove spaces at head */ 06849 while (s < e) { 06850 int n; 06851 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 06852 06853 if (!rb_isspace(cc)) break; 06854 s += n; 06855 } 06856 06857 if (s > RSTRING_PTR(str)) { 06858 STR_SET_LEN(str, t-s); 06859 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 06860 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06861 return str; 06862 } 06863 return Qnil; 06864 } 06865 06866 06867 /* 06868 * call-seq: 06869 * str.lstrip -> new_str 06870 * 06871 * Returns a copy of <i>str</i> with leading whitespace removed. See also 06872 * <code>String#rstrip</code> and <code>String#strip</code>. 06873 * 06874 * " hello ".lstrip #=> "hello " 06875 * "hello".lstrip #=> "hello" 06876 */ 06877 06878 static VALUE 06879 rb_str_lstrip(VALUE str) 06880 { 06881 str = rb_str_dup(str); 06882 rb_str_lstrip_bang(str); 06883 return str; 06884 } 06885 06886 06887 /* 06888 * call-seq: 06889 * str.rstrip! -> self or nil 06890 * 06891 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 06892 * no change was made. See also <code>String#lstrip!</code> and 06893 * <code>String#strip!</code>. 06894 * 06895 * " hello ".rstrip #=> " hello" 06896 * "hello".rstrip! 
#=> nil 06897 */ 06898 06899 static VALUE 06900 rb_str_rstrip_bang(VALUE str) 06901 { 06902 rb_encoding *enc; 06903 char *s, *t, *e; 06904 06905 str_modify_keep_cr(str); 06906 enc = STR_ENC_GET(str); 06907 rb_str_check_dummy_enc(enc); 06908 s = RSTRING_PTR(str); 06909 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06910 t = e = RSTRING_END(str); 06911 06912 /* remove trailing spaces or '\0's */ 06913 if (single_byte_optimizable(str)) { 06914 unsigned char c; 06915 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 06916 } 06917 else { 06918 char *tp; 06919 06920 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 06921 unsigned int c = rb_enc_codepoint(tp, e, enc); 06922 if (c && !rb_isspace(c)) break; 06923 t = tp; 06924 } 06925 } 06926 if (t < e) { 06927 long len = t-RSTRING_PTR(str); 06928 06929 STR_SET_LEN(str, len); 06930 RSTRING_PTR(str)[len] = '\0'; 06931 return str; 06932 } 06933 return Qnil; 06934 } 06935 06936 06937 /* 06938 * call-seq: 06939 * str.rstrip -> new_str 06940 * 06941 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 06942 * <code>String#lstrip</code> and <code>String#strip</code>. 06943 * 06944 * " hello ".rstrip #=> " hello" 06945 * "hello".rstrip #=> "hello" 06946 */ 06947 06948 static VALUE 06949 rb_str_rstrip(VALUE str) 06950 { 06951 str = rb_str_dup(str); 06952 rb_str_rstrip_bang(str); 06953 return str; 06954 } 06955 06956 06957 /* 06958 * call-seq: 06959 * str.strip! -> str or nil 06960 * 06961 * Removes leading and trailing whitespace from <i>str</i>. Returns 06962 * <code>nil</code> if <i>str</i> was not altered. 
06963 */ 06964 06965 static VALUE 06966 rb_str_strip_bang(VALUE str) 06967 { 06968 VALUE l = rb_str_lstrip_bang(str); 06969 VALUE r = rb_str_rstrip_bang(str); 06970 06971 if (NIL_P(l) && NIL_P(r)) return Qnil; 06972 return str; 06973 } 06974 06975 06976 /* 06977 * call-seq: 06978 * str.strip -> new_str 06979 * 06980 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 06981 * 06982 * " hello ".strip #=> "hello" 06983 * "\tgoodbye\r\n".strip #=> "goodbye" 06984 */ 06985 06986 static VALUE 06987 rb_str_strip(VALUE str) 06988 { 06989 str = rb_str_dup(str); 06990 rb_str_strip_bang(str); 06991 return str; 06992 } 06993 06994 static VALUE 06995 scan_once(VALUE str, VALUE pat, long *start) 06996 { 06997 VALUE result, match; 06998 struct re_registers *regs; 06999 int i; 07000 07001 if (rb_reg_search(pat, str, *start, 0) >= 0) { 07002 match = rb_backref_get(); 07003 regs = RMATCH_REGS(match); 07004 if (BEG(0) == END(0)) { 07005 rb_encoding *enc = STR_ENC_GET(str); 07006 /* 07007 * Always consume at least one character of the input string 07008 */ 07009 if (RSTRING_LEN(str) > END(0)) 07010 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 07011 RSTRING_END(str), enc); 07012 else 07013 *start = END(0)+1; 07014 } 07015 else { 07016 *start = END(0); 07017 } 07018 if (regs->num_regs == 1) { 07019 return rb_reg_nth_match(0, match); 07020 } 07021 result = rb_ary_new2(regs->num_regs); 07022 for (i=1; i < regs->num_regs; i++) { 07023 rb_ary_push(result, rb_reg_nth_match(i, match)); 07024 } 07025 07026 return result; 07027 } 07028 return Qnil; 07029 } 07030 07031 07032 /* 07033 * call-seq: 07034 * str.scan(pattern) -> array 07035 * str.scan(pattern) {|match, ...| block } -> str 07036 * 07037 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 07038 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 07039 * generated and either added to the result array or passed to the block. 
If 07040 * the pattern contains no groups, each individual result consists of the 07041 * matched string, <code>$&</code>. If the pattern contains groups, each 07042 * individual result is itself an array containing one entry per group. 07043 * 07044 * a = "cruel world" 07045 * a.scan(/\w+/) #=> ["cruel", "world"] 07046 * a.scan(/.../) #=> ["cru", "el ", "wor"] 07047 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 07048 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 07049 * 07050 * And the block form: 07051 * 07052 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 07053 * print "\n" 07054 * a.scan(/(.)(.)/) {|x,y| print y, x } 07055 * print "\n" 07056 * 07057 * <em>produces:</em> 07058 * 07059 * <<cruel>> <<world>> 07060 * rceu lowlr 07061 */ 07062 07063 static VALUE 07064 rb_str_scan(VALUE str, VALUE pat) 07065 { 07066 VALUE result; 07067 long start = 0; 07068 long last = -1, prev = 0; 07069 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 07070 07071 pat = get_pat(pat, 1); 07072 if (!rb_block_given_p()) { 07073 VALUE ary = rb_ary_new(); 07074 07075 while (!NIL_P(result = scan_once(str, pat, &start))) { 07076 last = prev; 07077 prev = start; 07078 rb_ary_push(ary, result); 07079 } 07080 if (last >= 0) rb_reg_search(pat, str, last, 0); 07081 return ary; 07082 } 07083 07084 while (!NIL_P(result = scan_once(str, pat, &start))) { 07085 last = prev; 07086 prev = start; 07087 rb_yield(result); 07088 str_mod_check(str, p, len); 07089 } 07090 if (last >= 0) rb_reg_search(pat, str, last, 0); 07091 return str; 07092 } 07093 07094 07095 /* 07096 * call-seq: 07097 * str.hex -> integer 07098 * 07099 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 07100 * (with an optional sign and an optional <code>0x</code>) and returns the 07101 * corresponding number. Zero is returned on error. 
 *
 *     "0x0a".hex     #=> 10
 *     "-1234".hex    #=> -4660
 *     "0".hex        #=> 0
 *     "wombat".hex   #=> 0
 */

static VALUE
rb_str_hex(VALUE str)
{
    /* base 16; FALSE is the third argument of rb_str_to_inum --
     * NOTE(review): presumably the "raise on invalid input" flag, confirm */
    return rb_str_to_inum(str, 16, FALSE);
}


/*
 *  call-seq:
 *     str.oct   -> integer
 *
 *  Treats leading characters of <i>str</i> as a string of octal digits (with an
 *  optional sign) and returns the corresponding number.  Returns 0 if the
 *  conversion fails.
 *
 *     "123".oct       #=> 83
 *     "-377".oct      #=> -255
 *     "bad".oct       #=> 0
 *     "0377bad".oct   #=> 255
 */

static VALUE
rb_str_oct(VALUE str)
{
    /* note the negative base (-8) -- NOTE(review): presumably tells
     * rb_str_to_inum to default to octal while honouring radix prefixes;
     * confirm against rb_str_to_inum */
    return rb_str_to_inum(str, -8, FALSE);
}


/*
 *  call-seq:
 *     str.crypt(salt_str)   -> new_str
 *
 *  Applies a one-way cryptographic hash to <i>str</i> by invoking the
 *  standard library function <code>crypt(3)</code> with the given
 *  salt string.  While the format and the result are system and
 *  implementation dependent, using a salt matching the regular
 *  expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
 *  safe on any platform, in which only the first two characters are
 *  significant.
 *
 *  This method is for use in system specific scripts, so if you want
 *  a cross-platform hash function consider using Digest or OpenSSL
 *  instead.
07152 */ 07153 07154 static VALUE 07155 rb_str_crypt(VALUE str, VALUE salt) 07156 { 07157 extern char *crypt(const char *, const char *); 07158 VALUE result; 07159 const char *s, *saltp; 07160 char *res; 07161 #ifdef BROKEN_CRYPT 07162 char salt_8bit_clean[3]; 07163 #endif 07164 07165 StringValue(salt); 07166 if (RSTRING_LEN(salt) < 2) 07167 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 07168 07169 s = RSTRING_PTR(str); 07170 if (!s) s = ""; 07171 saltp = RSTRING_PTR(salt); 07172 #ifdef BROKEN_CRYPT 07173 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 07174 salt_8bit_clean[0] = saltp[0] & 0x7f; 07175 salt_8bit_clean[1] = saltp[1] & 0x7f; 07176 salt_8bit_clean[2] = '\0'; 07177 saltp = salt_8bit_clean; 07178 } 07179 #endif 07180 res = crypt(s, saltp); 07181 if (!res) { 07182 rb_sys_fail("crypt"); 07183 } 07184 result = rb_str_new2(res); 07185 OBJ_INFECT(result, str); 07186 OBJ_INFECT(result, salt); 07187 return result; 07188 } 07189 07190 07191 /* 07192 * call-seq: 07193 * str.intern -> symbol 07194 * str.to_sym -> symbol 07195 * 07196 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the 07197 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 07198 * 07199 * "Koala".intern #=> :Koala 07200 * s = 'cat'.to_sym #=> :cat 07201 * s == :cat #=> true 07202 * s = '@cat'.to_sym #=> :@cat 07203 * s == :@cat #=> true 07204 * 07205 * This can also be used to create symbols that cannot be represented using the 07206 * <code>:xxx</code> notation. 07207 * 07208 * 'cat and dog'.to_sym #=> :"cat and dog" 07209 */ 07210 07211 VALUE 07212 rb_str_intern(VALUE s) 07213 { 07214 VALUE str = RB_GC_GUARD(s); 07215 ID id; 07216 07217 id = rb_intern_str(str); 07218 return ID2SYM(id); 07219 } 07220 07221 07222 /* 07223 * call-seq: 07224 * str.ord -> integer 07225 * 07226 * Return the <code>Integer</code> ordinal of a one-character string. 
07227 * 07228 * "a".ord #=> 97 07229 */ 07230 07231 VALUE 07232 rb_str_ord(VALUE s) 07233 { 07234 unsigned int c; 07235 07236 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 07237 return UINT2NUM(c); 07238 } 07239 /* 07240 * call-seq: 07241 * str.sum(n=16) -> integer 07242 * 07243 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 07244 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 07245 * to 16. The result is simply the sum of the binary value of each character in 07246 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 07247 * checksum. 07248 */ 07249 07250 static VALUE 07251 rb_str_sum(int argc, VALUE *argv, VALUE str) 07252 { 07253 VALUE vbits; 07254 int bits; 07255 char *ptr, *p, *pend; 07256 long len; 07257 VALUE sum = INT2FIX(0); 07258 unsigned long sum0 = 0; 07259 07260 if (argc == 0) { 07261 bits = 16; 07262 } 07263 else { 07264 rb_scan_args(argc, argv, "01", &vbits); 07265 bits = NUM2INT(vbits); 07266 } 07267 ptr = p = RSTRING_PTR(str); 07268 len = RSTRING_LEN(str); 07269 pend = p + len; 07270 07271 while (p < pend) { 07272 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 07273 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07274 str_mod_check(str, ptr, len); 07275 sum0 = 0; 07276 } 07277 sum0 += (unsigned char)*p; 07278 p++; 07279 } 07280 07281 if (bits == 0) { 07282 if (sum0) { 07283 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07284 } 07285 } 07286 else { 07287 if (sum == INT2FIX(0)) { 07288 if (bits < (int)sizeof(long)*CHAR_BIT) { 07289 sum0 &= (((unsigned long)1)<<bits)-1; 07290 } 07291 sum = LONG2FIX(sum0); 07292 } 07293 else { 07294 VALUE mod; 07295 07296 if (sum0) { 07297 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 07298 } 07299 07300 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 07301 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 07302 sum = rb_funcall(sum, '&', 1, mod); 07303 } 07304 } 07305 return sum; 07306 } 07307 07308 static VALUE 
07309 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 07310 { 07311 rb_encoding *enc; 07312 VALUE w; 07313 long width, len, flen = 1, fclen = 1; 07314 VALUE res; 07315 char *p; 07316 const char *f = " "; 07317 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 07318 volatile VALUE pad; 07319 int singlebyte = 1, cr; 07320 07321 rb_scan_args(argc, argv, "11", &w, &pad); 07322 enc = STR_ENC_GET(str); 07323 width = NUM2LONG(w); 07324 if (argc == 2) { 07325 StringValue(pad); 07326 enc = rb_enc_check(str, pad); 07327 f = RSTRING_PTR(pad); 07328 flen = RSTRING_LEN(pad); 07329 fclen = str_strlen(pad, enc); 07330 singlebyte = single_byte_optimizable(pad); 07331 if (flen == 0 || fclen == 0) { 07332 rb_raise(rb_eArgError, "zero width padding"); 07333 } 07334 } 07335 len = str_strlen(str, enc); 07336 if (width < 0 || len >= width) return rb_str_dup(str); 07337 n = width - len; 07338 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 07339 rlen = n - llen; 07340 cr = ENC_CODERANGE(str); 07341 if (flen > 1) { 07342 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 07343 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 07344 } 07345 size = RSTRING_LEN(str); 07346 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 07347 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 07348 (len += llen2 + rlen2) >= LONG_MAX - size) { 07349 rb_raise(rb_eArgError, "argument too big"); 07350 } 07351 len += size; 07352 res = rb_str_new5(str, 0, len); 07353 p = RSTRING_PTR(res); 07354 if (flen <= 1) { 07355 memset(p, *f, llen); 07356 p += llen; 07357 } 07358 else { 07359 while (llen >= fclen) { 07360 memcpy(p,f,flen); 07361 p += flen; 07362 llen -= fclen; 07363 } 07364 if (llen > 0) { 07365 memcpy(p, f, llen2); 07366 p += llen2; 07367 } 07368 } 07369 memcpy(p, RSTRING_PTR(str), size); 07370 p += size; 07371 if (flen <= 1) { 07372 memset(p, *f, rlen); 07373 p += rlen; 07374 } 07375 else { 07376 while (rlen >= fclen) { 07377 memcpy(p,f,flen); 07378 p += 
flen; 07379 rlen -= fclen; 07380 } 07381 if (rlen > 0) { 07382 memcpy(p, f, rlen2); 07383 p += rlen2; 07384 } 07385 } 07386 *p = '\0'; 07387 STR_SET_LEN(res, p-RSTRING_PTR(res)); 07388 OBJ_INFECT(res, str); 07389 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 07390 rb_enc_associate(res, enc); 07391 if (argc == 2) 07392 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 07393 if (cr != ENC_CODERANGE_BROKEN) 07394 ENC_CODERANGE_SET(res, cr); 07395 return res; 07396 } 07397 07398 07399 /* 07400 * call-seq: 07401 * str.ljust(integer, padstr=' ') -> new_str 07402 * 07403 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07404 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 07405 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07406 * 07407 * "hello".ljust(4) #=> "hello" 07408 * "hello".ljust(20) #=> "hello " 07409 * "hello".ljust(20, '1234') #=> "hello123412341234123" 07410 */ 07411 07412 static VALUE 07413 rb_str_ljust(int argc, VALUE *argv, VALUE str) 07414 { 07415 return rb_str_justify(argc, argv, str, 'l'); 07416 } 07417 07418 07419 /* 07420 * call-seq: 07421 * str.rjust(integer, padstr=' ') -> new_str 07422 * 07423 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07424 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified 07425 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07426 * 07427 * "hello".rjust(4) #=> "hello" 07428 * "hello".rjust(20) #=> " hello" 07429 * "hello".rjust(20, '1234') #=> "123412341234123hello" 07430 */ 07431 07432 static VALUE 07433 rb_str_rjust(int argc, VALUE *argv, VALUE str) 07434 { 07435 return rb_str_justify(argc, argv, str, 'r'); 07436 } 07437 07438 07439 /* 07440 * call-seq: 07441 * str.center(width, padstr=' ') -> new_str 07442 * 07443 * Centers +str+ in +width+. 
If +width+ is greater than the length of +str+, 07444 * returns a new String of length +width+ with +str+ centered and padded with 07445 * +padstr+; otherwise, returns +str+. 07446 * 07447 * "hello".center(4) #=> "hello" 07448 * "hello".center(20) #=> " hello " 07449 * "hello".center(20, '123') #=> "1231231hello12312312" 07450 */ 07451 07452 static VALUE 07453 rb_str_center(int argc, VALUE *argv, VALUE str) 07454 { 07455 return rb_str_justify(argc, argv, str, 'c'); 07456 } 07457 07458 /* 07459 * call-seq: 07460 * str.partition(sep) -> [head, sep, tail] 07461 * str.partition(regexp) -> [head, match, tail] 07462 * 07463 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string 07464 * and returns the part before it, the match, and the part 07465 * after it. 07466 * If it is not found, returns two empty strings and <i>str</i>. 07467 * 07468 * "hello".partition("l") #=> ["he", "l", "lo"] 07469 * "hello".partition("x") #=> ["hello", "", ""] 07470 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 07471 */ 07472 07473 static VALUE 07474 rb_str_partition(VALUE str, VALUE sep) 07475 { 07476 long pos; 07477 int regex = FALSE; 07478 07479 if (RB_TYPE_P(sep, T_REGEXP)) { 07480 pos = rb_reg_search(sep, str, 0, 0); 07481 regex = TRUE; 07482 } 07483 else { 07484 VALUE tmp; 07485 07486 tmp = rb_check_string_type(sep); 07487 if (NIL_P(tmp)) { 07488 rb_raise(rb_eTypeError, "type mismatch: %s given", 07489 rb_obj_classname(sep)); 07490 } 07491 sep = tmp; 07492 pos = rb_str_index(str, sep, 0); 07493 } 07494 if (pos < 0) { 07495 failed: 07496 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 07497 } 07498 if (regex) { 07499 sep = rb_str_subpat(str, sep, INT2FIX(0)); 07500 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 07501 } 07502 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 07503 sep, 07504 rb_str_subseq(str, pos+RSTRING_LEN(sep), 07505 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 07506 } 07507 07508 /* 07509 * call-seq: 07510 * str.rpartition(sep) -> 
[head, sep, tail] 07511 * str.rpartition(regexp) -> [head, match, tail] 07512 * 07513 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 07514 * of the string, and returns the part before it, the match, and the part 07515 * after it. 07516 * If it is not found, returns two empty strings and <i>str</i>. 07517 * 07518 * "hello".rpartition("l") #=> ["hel", "l", "o"] 07519 * "hello".rpartition("x") #=> ["", "", "hello"] 07520 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 07521 */ 07522 07523 static VALUE 07524 rb_str_rpartition(VALUE str, VALUE sep) 07525 { 07526 long pos = RSTRING_LEN(str); 07527 int regex = FALSE; 07528 07529 if (RB_TYPE_P(sep, T_REGEXP)) { 07530 pos = rb_reg_search(sep, str, pos, 1); 07531 regex = TRUE; 07532 } 07533 else { 07534 VALUE tmp; 07535 07536 tmp = rb_check_string_type(sep); 07537 if (NIL_P(tmp)) { 07538 rb_raise(rb_eTypeError, "type mismatch: %s given", 07539 rb_obj_classname(sep)); 07540 } 07541 sep = tmp; 07542 pos = rb_str_sublen(str, pos); 07543 pos = rb_str_rindex(str, sep, pos); 07544 } 07545 if (pos < 0) { 07546 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 07547 } 07548 if (regex) { 07549 sep = rb_reg_nth_match(0, rb_backref_get()); 07550 } 07551 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 07552 sep, 07553 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 07554 } 07555 07556 /* 07557 * call-seq: 07558 * str.start_with?([prefixes]+) -> true or false 07559 * 07560 * Returns true if +str+ starts with one of the +prefixes+ given. 07561 * 07562 * "hello".start_with?("hell") #=> true 07563 * 07564 * # returns true if one of the prefixes matches. 
07565 * "hello".start_with?("heaven", "hell") #=> true 07566 * "hello".start_with?("heaven", "paradise") #=> false 07567 */ 07568 07569 static VALUE 07570 rb_str_start_with(int argc, VALUE *argv, VALUE str) 07571 { 07572 int i; 07573 07574 for (i=0; i<argc; i++) { 07575 VALUE tmp = argv[i]; 07576 StringValue(tmp); 07577 rb_enc_check(str, tmp); 07578 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07579 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07580 return Qtrue; 07581 } 07582 return Qfalse; 07583 } 07584 07585 /* 07586 * call-seq: 07587 * str.end_with?([suffixes]+) -> true or false 07588 * 07589 * Returns true if +str+ ends with one of the +suffixes+ given. 07590 */ 07591 07592 static VALUE 07593 rb_str_end_with(int argc, VALUE *argv, VALUE str) 07594 { 07595 int i; 07596 char *p, *s, *e; 07597 rb_encoding *enc; 07598 07599 for (i=0; i<argc; i++) { 07600 VALUE tmp = argv[i]; 07601 StringValue(tmp); 07602 enc = rb_enc_check(str, tmp); 07603 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07604 p = RSTRING_PTR(str); 07605 e = p + RSTRING_LEN(str); 07606 s = e - RSTRING_LEN(tmp); 07607 if (rb_enc_left_char_head(p, s, e, enc) != s) 07608 continue; 07609 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07610 return Qtrue; 07611 } 07612 return Qfalse; 07613 } 07614 07615 void 07616 rb_str_setter(VALUE val, ID id, VALUE *var) 07617 { 07618 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) { 07619 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 07620 } 07621 *var = val; 07622 } 07623 07624 07625 /* 07626 * call-seq: 07627 * str.force_encoding(encoding) -> str 07628 * 07629 * Changes the encoding to +encoding+ and returns self. 
07630 */ 07631 07632 static VALUE 07633 rb_str_force_encoding(VALUE str, VALUE enc) 07634 { 07635 str_modifiable(str); 07636 rb_enc_associate(str, rb_to_encoding(enc)); 07637 ENC_CODERANGE_CLEAR(str); 07638 return str; 07639 } 07640 07641 /* 07642 * call-seq: 07643 * str.b -> str 07644 * 07645 * Returns a copied string whose encoding is ASCII-8BIT. 07646 */ 07647 07648 static VALUE 07649 rb_str_b(VALUE str) 07650 { 07651 VALUE str2 = str_alloc(rb_cString); 07652 str_replace_shared_without_enc(str2, str); 07653 OBJ_INFECT(str2, str); 07654 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 07655 return str2; 07656 } 07657 07658 /* 07659 * call-seq: 07660 * str.valid_encoding? -> true or false 07661 * 07662 * Returns true for a string which encoded correctly. 07663 * 07664 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 07665 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 07666 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false 07667 */ 07668 07669 static VALUE 07670 rb_str_valid_encoding_p(VALUE str) 07671 { 07672 int cr = rb_enc_str_coderange(str); 07673 07674 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 07675 } 07676 07677 /* 07678 * call-seq: 07679 * str.ascii_only? -> true or false 07680 * 07681 * Returns true for a string which has only ASCII characters. 07682 * 07683 * "abc".force_encoding("UTF-8").ascii_only? #=> true 07684 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 07685 */ 07686 07687 static VALUE 07688 rb_str_is_ascii_only_p(VALUE str) 07689 { 07690 int cr = rb_enc_str_coderange(str); 07691 07692 return cr == ENC_CODERANGE_7BIT ? 
Qtrue : Qfalse; 07693 } 07694 07709 VALUE 07710 rb_str_ellipsize(VALUE str, long len) 07711 { 07712 static const char ellipsis[] = "..."; 07713 const long ellipsislen = sizeof(ellipsis) - 1; 07714 rb_encoding *const enc = rb_enc_get(str); 07715 const long blen = RSTRING_LEN(str); 07716 const char *const p = RSTRING_PTR(str), *e = p + blen; 07717 VALUE estr, ret = 0; 07718 07719 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 07720 if (len * rb_enc_mbminlen(enc) >= blen || 07721 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 07722 ret = str; 07723 } 07724 else if (len <= ellipsislen || 07725 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 07726 if (rb_enc_asciicompat(enc)) { 07727 ret = rb_str_new_with_class(str, ellipsis, len); 07728 rb_enc_associate(ret, enc); 07729 } 07730 else { 07731 estr = rb_usascii_str_new(ellipsis, len); 07732 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 07733 } 07734 } 07735 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 07736 rb_str_cat(ret, ellipsis, ellipsislen); 07737 } 07738 else { 07739 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 07740 rb_enc_from_encoding(enc), 0, Qnil); 07741 rb_str_append(ret, estr); 07742 } 07743 return ret; 07744 } 07745 07746 /********************************************************************** 07747 * Document-class: Symbol 07748 * 07749 * <code>Symbol</code> objects represent names and some strings 07750 * inside the Ruby 07751 * interpreter. They are generated using the <code>:name</code> and 07752 * <code>:"string"</code> literals 07753 * syntax, and by the various <code>to_sym</code> methods. The same 07754 * <code>Symbol</code> object will be created for a given name or string 07755 * for the duration of a program's execution, regardless of the context 07756 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 07757 * one context, a method in another, and a class in a third, the 07758 * <code>Symbol</code> <code>:Fred</code> will be the same object in 07759 * all three contexts. 07760 * 07761 * module One 07762 * class Fred 07763 * end 07764 * $f1 = :Fred 07765 * end 07766 * module Two 07767 * Fred = 1 07768 * $f2 = :Fred 07769 * end 07770 * def Fred() 07771 * end 07772 * $f3 = :Fred 07773 * $f1.object_id #=> 2514190 07774 * $f2.object_id #=> 2514190 07775 * $f3.object_id #=> 2514190 07776 * 07777 */ 07778 07779 07780 /* 07781 * call-seq: 07782 * sym == obj -> true or false 07783 * 07784 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 07785 * symbol, returns <code>true</code>. 07786 */ 07787 07788 static VALUE 07789 sym_equal(VALUE sym1, VALUE sym2) 07790 { 07791 if (sym1 == sym2) return Qtrue; 07792 return Qfalse; 07793 } 07794 07795 07796 static int 07797 sym_printable(const char *s, const char *send, rb_encoding *enc) 07798 { 07799 while (s < send) { 07800 int n; 07801 int c = rb_enc_codepoint_len(s, send, &n, enc); 07802 07803 if (!rb_enc_isprint(c, enc)) return FALSE; 07804 s += n; 07805 } 07806 return TRUE; 07807 } 07808 07809 int 07810 rb_str_symname_p(VALUE sym) 07811 { 07812 rb_encoding *enc; 07813 const char *ptr; 07814 long len; 07815 rb_encoding *resenc = rb_default_internal_encoding(); 07816 07817 if (resenc == NULL) resenc = rb_default_external_encoding(); 07818 enc = STR_ENC_GET(sym); 07819 ptr = RSTRING_PTR(sym); 07820 len = RSTRING_LEN(sym); 07821 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 07822 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 07823 return FALSE; 07824 } 07825 return TRUE; 07826 } 07827 07828 VALUE 07829 rb_str_quote_unprintable(VALUE str) 07830 { 07831 rb_encoding *enc; 07832 const char *ptr; 07833 long len; 07834 rb_encoding *resenc; 07835 07836 Check_Type(str, T_STRING); 07837 resenc = rb_default_internal_encoding(); 
07838 if (resenc == NULL) resenc = rb_default_external_encoding(); 07839 enc = STR_ENC_GET(str); 07840 ptr = RSTRING_PTR(str); 07841 len = RSTRING_LEN(str); 07842 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) || 07843 !sym_printable(ptr, ptr + len, enc)) { 07844 return rb_str_inspect(str); 07845 } 07846 return str; 07847 } 07848 07849 VALUE 07850 rb_id_quote_unprintable(ID id) 07851 { 07852 return rb_str_quote_unprintable(rb_id2str(id)); 07853 } 07854 07855 /* 07856 * call-seq: 07857 * sym.inspect -> string 07858 * 07859 * Returns the representation of <i>sym</i> as a symbol literal. 07860 * 07861 * :fred.inspect #=> ":fred" 07862 */ 07863 07864 static VALUE 07865 sym_inspect(VALUE sym) 07866 { 07867 VALUE str; 07868 const char *ptr; 07869 long len; 07870 ID id = SYM2ID(sym); 07871 char *dest; 07872 07873 sym = rb_id2str(id); 07874 if (!rb_str_symname_p(sym)) { 07875 str = rb_str_inspect(sym); 07876 len = RSTRING_LEN(str); 07877 rb_str_resize(str, len + 1); 07878 dest = RSTRING_PTR(str); 07879 memmove(dest + 1, dest, len); 07880 dest[0] = ':'; 07881 } 07882 else { 07883 rb_encoding *enc = STR_ENC_GET(sym); 07884 ptr = RSTRING_PTR(sym); 07885 len = RSTRING_LEN(sym); 07886 str = rb_enc_str_new(0, len + 1, enc); 07887 dest = RSTRING_PTR(str); 07888 dest[0] = ':'; 07889 memcpy(dest + 1, ptr, len); 07890 } 07891 return str; 07892 } 07893 07894 07895 /* 07896 * call-seq: 07897 * sym.id2name -> string 07898 * sym.to_s -> string 07899 * 07900 * Returns the name or string corresponding to <i>sym</i>. 07901 * 07902 * :fred.id2name #=> "fred" 07903 */ 07904 07905 07906 VALUE 07907 rb_sym_to_s(VALUE sym) 07908 { 07909 ID id = SYM2ID(sym); 07910 07911 return str_new3(rb_cString, rb_id2str(id)); 07912 } 07913 07914 07915 /* 07916 * call-seq: 07917 * sym.to_sym -> sym 07918 * sym.intern -> sym 07919 * 07920 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 07921 * to an object. 
As <i>sym</i> is already a symbol, <code>self</code> is returned 07922 * in this case. 07923 */ 07924 07925 static VALUE 07926 sym_to_sym(VALUE sym) 07927 { 07928 return sym; 07929 } 07930 07931 static VALUE 07932 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc) 07933 { 07934 VALUE obj; 07935 07936 if (argc < 1) { 07937 rb_raise(rb_eArgError, "no receiver given"); 07938 } 07939 obj = argv[0]; 07940 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc); 07941 } 07942 07943 /* 07944 * call-seq: 07945 * sym.to_proc 07946 * 07947 * Returns a _Proc_ object which respond to the given method by _sym_. 07948 * 07949 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 07950 */ 07951 07952 static VALUE 07953 sym_to_proc(VALUE sym) 07954 { 07955 static VALUE sym_proc_cache = Qfalse; 07956 enum {SYM_PROC_CACHE_SIZE = 67}; 07957 VALUE proc; 07958 long id, index; 07959 VALUE *aryp; 07960 07961 if (!sym_proc_cache) { 07962 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 07963 rb_gc_register_mark_object(sym_proc_cache); 07964 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 07965 } 07966 07967 id = SYM2ID(sym); 07968 index = (id % SYM_PROC_CACHE_SIZE) << 1; 07969 07970 aryp = RARRAY_PTR(sym_proc_cache); 07971 if (aryp[index] == sym) { 07972 return aryp[index + 1]; 07973 } 07974 else { 07975 proc = rb_proc_new(sym_call, (VALUE)id); 07976 aryp[index] = sym; 07977 aryp[index + 1] = proc; 07978 return proc; 07979 } 07980 } 07981 07982 /* 07983 * call-seq: 07984 * 07985 * sym.succ 07986 * 07987 * Same as <code>sym.to_s.succ.intern</code>. 07988 */ 07989 07990 static VALUE 07991 sym_succ(VALUE sym) 07992 { 07993 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 07994 } 07995 07996 /* 07997 * call-seq: 07998 * 07999 * symbol <=> other_symbol -> -1, 0, +1 or nil 08000 * 08001 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the 08002 * symbols. 
Returns -1, 0, +1 or nil depending on whether +symbol+ is less 08003 * than, equal to, or greater than +other_symbol+. 08004 * 08005 * +nil+ is returned if the two values are incomparable. 08006 * 08007 * See String#<=> for more information. 08008 */ 08009 08010 static VALUE 08011 sym_cmp(VALUE sym, VALUE other) 08012 { 08013 if (!SYMBOL_P(other)) { 08014 return Qnil; 08015 } 08016 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 08017 } 08018 08019 /* 08020 * call-seq: 08021 * 08022 * sym.casecmp(other) -> -1, 0, +1 or nil 08023 * 08024 * Case-insensitive version of <code>Symbol#<=></code>. 08025 */ 08026 08027 static VALUE 08028 sym_casecmp(VALUE sym, VALUE other) 08029 { 08030 if (!SYMBOL_P(other)) { 08031 return Qnil; 08032 } 08033 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 08034 } 08035 08036 /* 08037 * call-seq: 08038 * sym =~ obj -> fixnum or nil 08039 * 08040 * Returns <code>sym.to_s =~ obj</code>. 08041 */ 08042 08043 static VALUE 08044 sym_match(VALUE sym, VALUE other) 08045 { 08046 return rb_str_match(rb_sym_to_s(sym), other); 08047 } 08048 08049 /* 08050 * call-seq: 08051 * sym[idx] -> char 08052 * sym[b, n] -> char 08053 * 08054 * Returns <code>sym.to_s[]</code>. 08055 */ 08056 08057 static VALUE 08058 sym_aref(int argc, VALUE *argv, VALUE sym) 08059 { 08060 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 08061 } 08062 08063 /* 08064 * call-seq: 08065 * sym.length -> integer 08066 * 08067 * Same as <code>sym.to_s.length</code>. 08068 */ 08069 08070 static VALUE 08071 sym_length(VALUE sym) 08072 { 08073 return rb_str_length(rb_id2str(SYM2ID(sym))); 08074 } 08075 08076 /* 08077 * call-seq: 08078 * sym.empty? -> true or false 08079 * 08080 * Returns that _sym_ is :"" or not. 08081 */ 08082 08083 static VALUE 08084 sym_empty(VALUE sym) 08085 { 08086 return rb_str_empty(rb_id2str(SYM2ID(sym))); 08087 } 08088 08089 /* 08090 * call-seq: 08091 * sym.upcase -> symbol 08092 * 08093 * Same as <code>sym.to_s.upcase.intern</code>. 
08094 */ 08095 08096 static VALUE 08097 sym_upcase(VALUE sym) 08098 { 08099 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 08100 } 08101 08102 /* 08103 * call-seq: 08104 * sym.downcase -> symbol 08105 * 08106 * Same as <code>sym.to_s.downcase.intern</code>. 08107 */ 08108 08109 static VALUE 08110 sym_downcase(VALUE sym) 08111 { 08112 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 08113 } 08114 08115 /* 08116 * call-seq: 08117 * sym.capitalize -> symbol 08118 * 08119 * Same as <code>sym.to_s.capitalize.intern</code>. 08120 */ 08121 08122 static VALUE 08123 sym_capitalize(VALUE sym) 08124 { 08125 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 08126 } 08127 08128 /* 08129 * call-seq: 08130 * sym.swapcase -> symbol 08131 * 08132 * Same as <code>sym.to_s.swapcase.intern</code>. 08133 */ 08134 08135 static VALUE 08136 sym_swapcase(VALUE sym) 08137 { 08138 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 08139 } 08140 08141 /* 08142 * call-seq: 08143 * sym.encoding -> encoding 08144 * 08145 * Returns the Encoding object that represents the encoding of _sym_. 08146 */ 08147 08148 static VALUE 08149 sym_encoding(VALUE sym) 08150 { 08151 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 08152 } 08153 08154 ID 08155 rb_to_id(VALUE name) 08156 { 08157 VALUE tmp; 08158 08159 switch (TYPE(name)) { 08160 default: 08161 tmp = rb_check_string_type(name); 08162 if (NIL_P(tmp)) { 08163 tmp = rb_inspect(name); 08164 rb_raise(rb_eTypeError, "%s is not a symbol", 08165 RSTRING_PTR(tmp)); 08166 } 08167 name = tmp; 08168 /* fall through */ 08169 case T_STRING: 08170 name = rb_str_intern(name); 08171 /* fall through */ 08172 case T_SYMBOL: 08173 return SYM2ID(name); 08174 } 08175 08176 UNREACHABLE; 08177 } 08178 08179 /* 08180 * A <code>String</code> object holds and manipulates an arbitrary sequence of 08181 * bytes, typically representing characters. 
String objects may be created 08182 * using <code>String::new</code> or as literals. 08183 * 08184 * Because of aliasing issues, users of strings should be aware of the methods 08185 * that modify the contents of a <code>String</code> object. Typically, 08186 * methods with names ending in ``!'' modify their receiver, while those 08187 * without a ``!'' return a new <code>String</code>. However, there are 08188 * exceptions, such as <code>String#[]=</code>. 08189 * 08190 */ 08191 08192 void 08193 Init_String(void) 08194 { 08195 #undef rb_intern 08196 #define rb_intern(str) rb_intern_const(str) 08197 08198 rb_cString = rb_define_class("String", rb_cObject); 08199 rb_include_module(rb_cString, rb_mComparable); 08200 rb_define_alloc_func(rb_cString, empty_str_alloc); 08201 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1); 08202 rb_define_method(rb_cString, "initialize", rb_str_init, -1); 08203 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1); 08204 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1); 08205 rb_define_method(rb_cString, "==", rb_str_equal, 1); 08206 rb_define_method(rb_cString, "===", rb_str_equal, 1); 08207 rb_define_method(rb_cString, "eql?", rb_str_eql, 1); 08208 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0); 08209 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1); 08210 rb_define_method(rb_cString, "+", rb_str_plus, 1); 08211 rb_define_method(rb_cString, "*", rb_str_times, 1); 08212 rb_define_method(rb_cString, "%", rb_str_format_m, 1); 08213 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1); 08214 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1); 08215 rb_define_method(rb_cString, "insert", rb_str_insert, 2); 08216 rb_define_method(rb_cString, "length", rb_str_length, 0); 08217 rb_define_method(rb_cString, "size", rb_str_length, 0); 08218 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0); 08219 rb_define_method(rb_cString, "empty?", rb_str_empty, 0); 08220 
rb_define_method(rb_cString, "=~", rb_str_match, 1); 08221 rb_define_method(rb_cString, "match", rb_str_match_m, -1); 08222 rb_define_method(rb_cString, "succ", rb_str_succ, 0); 08223 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0); 08224 rb_define_method(rb_cString, "next", rb_str_succ, 0); 08225 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0); 08226 rb_define_method(rb_cString, "upto", rb_str_upto, -1); 08227 rb_define_method(rb_cString, "index", rb_str_index_m, -1); 08228 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1); 08229 rb_define_method(rb_cString, "replace", rb_str_replace, 1); 08230 rb_define_method(rb_cString, "clear", rb_str_clear, 0); 08231 rb_define_method(rb_cString, "chr", rb_str_chr, 0); 08232 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); 08233 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); 08234 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); 08235 08236 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); 08237 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); 08238 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0); 08239 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); 08240 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); 08241 rb_define_method(rb_cString, "dump", rb_str_dump, 0); 08242 08243 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0); 08244 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0); 08245 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0); 08246 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0); 08247 08248 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0); 08249 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0); 08250 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0); 08251 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0); 08252 08253 rb_define_method(rb_cString, "hex", rb_str_hex, 0); 08254 
rb_define_method(rb_cString, "oct", rb_str_oct, 0); 08255 rb_define_method(rb_cString, "split", rb_str_split_m, -1); 08256 rb_define_method(rb_cString, "lines", rb_str_lines, -1); 08257 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0); 08258 rb_define_method(rb_cString, "chars", rb_str_chars, 0); 08259 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0); 08260 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0); 08261 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0); 08262 rb_define_method(rb_cString, "concat", rb_str_concat, 1); 08263 rb_define_method(rb_cString, "<<", rb_str_concat, 1); 08264 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1); 08265 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1); 08266 rb_define_method(rb_cString, "intern", rb_str_intern, 0); 08267 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); 08268 rb_define_method(rb_cString, "ord", rb_str_ord, 0); 08269 08270 rb_define_method(rb_cString, "include?", rb_str_include, 1); 08271 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1); 08272 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1); 08273 08274 rb_define_method(rb_cString, "scan", rb_str_scan, 1); 08275 08276 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1); 08277 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1); 08278 rb_define_method(rb_cString, "center", rb_str_center, -1); 08279 08280 rb_define_method(rb_cString, "sub", rb_str_sub, -1); 08281 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1); 08282 rb_define_method(rb_cString, "chop", rb_str_chop, 0); 08283 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1); 08284 rb_define_method(rb_cString, "strip", rb_str_strip, 0); 08285 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0); 08286 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0); 08287 08288 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1); 08289 rb_define_method(rb_cString, "gsub!", 
rb_str_gsub_bang, -1); 08290 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0); 08291 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1); 08292 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0); 08293 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0); 08294 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0); 08295 08296 rb_define_method(rb_cString, "tr", rb_str_tr, 2); 08297 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2); 08298 rb_define_method(rb_cString, "delete", rb_str_delete, -1); 08299 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1); 08300 rb_define_method(rb_cString, "count", rb_str_count, -1); 08301 08302 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2); 08303 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2); 08304 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1); 08305 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1); 08306 08307 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1); 08308 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0); 08309 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0); 08310 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0); 08311 08312 rb_define_method(rb_cString, "sum", rb_str_sum, -1); 08313 08314 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1); 08315 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1); 08316 08317 rb_define_method(rb_cString, "partition", rb_str_partition, 1); 08318 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1); 08319 08320 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */ 08321 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1); 08322 rb_define_method(rb_cString, "b", rb_str_b, 0); 08323 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0); 08324 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0); 
08325 08326 id_to_s = rb_intern("to_s"); 08327 08328 rb_fs = Qnil; 08329 rb_define_variable("$;", &rb_fs); 08330 rb_define_variable("$-F", &rb_fs); 08331 08332 rb_cSymbol = rb_define_class("Symbol", rb_cObject); 08333 rb_include_module(rb_cSymbol, rb_mComparable); 08334 rb_undef_alloc_func(rb_cSymbol); 08335 rb_undef_method(CLASS_OF(rb_cSymbol), "new"); 08336 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */ 08337 08338 rb_define_method(rb_cSymbol, "==", sym_equal, 1); 08339 rb_define_method(rb_cSymbol, "===", sym_equal, 1); 08340 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0); 08341 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0); 08342 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); 08343 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0); 08344 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0); 08345 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0); 08346 rb_define_method(rb_cSymbol, "succ", sym_succ, 0); 08347 rb_define_method(rb_cSymbol, "next", sym_succ, 0); 08348 08349 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1); 08350 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1); 08351 rb_define_method(rb_cSymbol, "=~", sym_match, 1); 08352 08353 rb_define_method(rb_cSymbol, "[]", sym_aref, -1); 08354 rb_define_method(rb_cSymbol, "slice", sym_aref, -1); 08355 rb_define_method(rb_cSymbol, "length", sym_length, 0); 08356 rb_define_method(rb_cSymbol, "size", sym_length, 0); 08357 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0); 08358 rb_define_method(rb_cSymbol, "match", sym_match, 1); 08359 08360 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0); 08361 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0); 08362 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0); 08363 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0); 08364 08365 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0); 08366 } 08367
1.7.6.1