Ruby 2.0.0p481 (2014-05-08 revision 45883)
string.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: nagachika $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "vm_core.h"
00018 #include "internal.h"
00019 #include "probes.h"
00020 #include <assert.h>
00021 
00022 #define BEG(no) (regs->beg[(no)])
00023 #define END(no) (regs->end[(no)])
00024 
00025 #include <math.h>
00026 #include <ctype.h>
00027 
00028 #ifdef HAVE_UNISTD_H
00029 #include <unistd.h>
00030 #endif
00031 
00032 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00033 
00034 #undef rb_str_new_cstr
00035 #undef rb_tainted_str_new_cstr
00036 #undef rb_usascii_str_new_cstr
00037 #undef rb_external_str_new_cstr
00038 #undef rb_locale_str_new_cstr
00039 #undef rb_str_new2
00040 #undef rb_str_new3
00041 #undef rb_str_new4
00042 #undef rb_str_new5
00043 #undef rb_tainted_str_new2
00044 #undef rb_usascii_str_new2
00045 #undef rb_str_dup_frozen
00046 #undef rb_str_buf_new_cstr
00047 #undef rb_str_buf_new2
00048 #undef rb_str_buf_cat2
00049 #undef rb_str_cat2
00050 
00051 static VALUE rb_str_clear(VALUE str);
00052 
00053 VALUE rb_cString;
00054 VALUE rb_cSymbol;
00055 
#define RUBY_MAX_CHAR_LEN 16

/* Flag bits this file keeps in a string object's flags word. */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3

/* STR_NOEMBED combined with ELTS_SHARED, respectively STR_ASSOC. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)

/*
 * "No capacity" state: a non-embedded string that is shared or carries an
 * association.  Code in this file avoids writing aux.capa for such strings.
 */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))

/* Drops the shared/assoc bits, but only when STR_NOEMBED is set. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) {\
        FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
    }\
} while (0)
00068 
00069 
/* True while STR_NOEMBED is clear. */
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))

/* Clears STR_NOEMBED; the embedded length field is left untouched. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)

/* Writes n into the embedded-length bit field of the flags word. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags = (RBASIC(str)->flags & ~RSTRING_EMBED_LEN_MASK) |\
        ((tmp_n) << RSTRING_EMBED_LEN_SHIFT);\
} while (0)

/* Sets STR_NOEMBED and then zeroes the embedded-length bit field. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
00081 
/* Stores n as the string length in whichever representation str uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)

/* Shrinks the string length by one in whichever representation str uses. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        STR_SET_EMBED_LEN((str), RSTRING_LEN(str) - 1);\
    }\
} while (0)
00101 
/*
 * Makes room for `capacity` bytes plus one extra byte.
 *
 * Heap strings are reallocated in place; aux.capa is updated only when the
 * string is not in a "no capacity" state.  An embedded string is moved to a
 * freshly allocated heap buffer only when `capacity` exceeds
 * RSTRING_EMBED_LEN_MAX; otherwise nothing happens.
 *
 * Note: `capacity` is evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (!STR_EMBED_P(str)) {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
    else if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
        char *tmp = ALLOC_N(char, (capacity)+1);\
        /* copy the bytes out before the heap fields are written */\
        memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
        RSTRING(str)->as.heap.ptr = tmp;\
        /* still flagged embedded here, so the length comes from the flags */\
        RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
        STR_SET_NOEMBED(str);\
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00119 
/* Encoding object of str, looked up through its encoding index. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))

/* Code range predicates; rb_enc_str_coderange() scans str when the range is not cached. */
#define is_ascii_string(str) (ENC_CODERANGE_7BIT == rb_enc_str_coderange(str))
#define is_broken_string(str) (ENC_CODERANGE_BROKEN == rb_enc_str_coderange(str))
00124 
00125 static inline int
00126 single_byte_optimizable(VALUE str)
00127 {
00128     rb_encoding *enc;
00129 
00130     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00131     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00132         return 1;
00133 
00134     enc = STR_ENC_GET(str);
00135     if (rb_enc_mbmaxlen(enc) == 1)
00136         return 1;
00137 
00138     /* Conservative.  Possibly single byte.
00139      * "\xa1" in Shift_JIS for example. */
00140     return 0;
00141 }
00142 
00143 VALUE rb_fs;
00144 
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When NONASCII_MASK is available and the range is longer than two VALUEs,
 * the middle of the range is examined one VALUE-sized word at a time; the
 * bytes before the first aligned address and the tail (starting at the
 * first suspicious word) are checked byte by byte.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* p rounded up to a VALUE boundary */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end;

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* e rounded down to a VALUE boundary */
        word_end = (const VALUE*)(~align_mask & (VALUE)e);
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK) {
                /* some byte in this word has its high bit set */
                word_end = word;
                break;
            }
        }
        p = (const char *)word_end;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00181 
00182 static int
00183 coderange_scan(const char *p, long len, rb_encoding *enc)
00184 {
00185     const char *e = p + len;
00186 
00187     if (rb_enc_to_index(enc) == 0) {
00188         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00189         p = search_nonascii(p, e);
00190         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00191     }
00192 
00193     if (rb_enc_asciicompat(enc)) {
00194         p = search_nonascii(p, e);
00195         if (!p) {
00196             return ENC_CODERANGE_7BIT;
00197         }
00198         while (p < e) {
00199             int ret = rb_enc_precise_mbclen(p, e, enc);
00200             if (!MBCLEN_CHARFOUND_P(ret)) {
00201                 return ENC_CODERANGE_BROKEN;
00202             }
00203             p += MBCLEN_CHARFOUND_LEN(ret);
00204             if (p < e) {
00205                 p = search_nonascii(p, e);
00206                 if (!p) {
00207                     return ENC_CODERANGE_VALID;
00208                 }
00209             }
00210         }
00211         if (e < p) {
00212             return ENC_CODERANGE_BROKEN;
00213         }
00214         return ENC_CODERANGE_VALID;
00215     }
00216 
00217     while (p < e) {
00218         int ret = rb_enc_precise_mbclen(p, e, enc);
00219 
00220         if (!MBCLEN_CHARFOUND_P(ret)) {
00221             return ENC_CODERANGE_BROKEN;
00222         }
00223         p += MBCLEN_CHARFOUND_LEN(ret);
00224     }
00225     if (e < p) {
00226         return ENC_CODERANGE_BROKEN;
00227     }
00228     return ENC_CODERANGE_VALID;
00229 }
00230 
00231 long
00232 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00233 {
00234     const char *p = s;
00235 
00236     if (*cr == ENC_CODERANGE_BROKEN)
00237         return e - s;
00238 
00239     if (rb_enc_to_index(enc) == 0) {
00240         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00241         p = search_nonascii(p, e);
00242         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00243         return e - s;
00244     }
00245     else if (rb_enc_asciicompat(enc)) {
00246         p = search_nonascii(p, e);
00247         if (!p) {
00248             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00249             return e - s;
00250         }
00251         while (p < e) {
00252             int ret = rb_enc_precise_mbclen(p, e, enc);
00253             if (!MBCLEN_CHARFOUND_P(ret)) {
00254                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00255                 return p - s;
00256             }
00257             p += MBCLEN_CHARFOUND_LEN(ret);
00258             if (p < e) {
00259                 p = search_nonascii(p, e);
00260                 if (!p) {
00261                     *cr = ENC_CODERANGE_VALID;
00262                     return e - s;
00263                 }
00264             }
00265         }
00266         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00267         return p - s;
00268     }
00269     else {
00270         while (p < e) {
00271             int ret = rb_enc_precise_mbclen(p, e, enc);
00272             if (!MBCLEN_CHARFOUND_P(ret)) {
00273                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00274                 return p - s;
00275             }
00276             p += MBCLEN_CHARFOUND_LEN(ret);
00277         }
00278         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00279         return p - s;
00280     }
00281 }
00282 
00283 static inline void
00284 str_enc_copy(VALUE str1, VALUE str2)
00285 {
00286     rb_enc_set_index(str1, ENCODING_GET(str2));
00287 }
00288 
00289 static void
00290 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00291 {
00292     /* this function is designed for copying encoding and coderange
00293      * from src to new string "dest" which is made from the part of src.
00294      */
00295     str_enc_copy(dest, src);
00296     if (RSTRING_LEN(dest) == 0) {
00297         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00298             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299         else
00300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301         return;
00302     }
00303     switch (ENC_CODERANGE(src)) {
00304       case ENC_CODERANGE_7BIT:
00305         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00306         break;
00307       case ENC_CODERANGE_VALID:
00308         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00309             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00310             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00311         else
00312             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00313         break;
00314       default:
00315         break;
00316     }
00317 }
00318 
00319 static void
00320 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00321 {
00322     str_enc_copy(dest, src);
00323     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00324 }
00325 
00326 int
00327 rb_enc_str_coderange(VALUE str)
00328 {
00329     int cr = ENC_CODERANGE(str);
00330 
00331     if (cr == ENC_CODERANGE_UNKNOWN) {
00332         rb_encoding *enc = STR_ENC_GET(str);
00333         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00334         ENC_CODERANGE_SET(str, cr);
00335     }
00336     return cr;
00337 }
00338 
00339 int
00340 rb_enc_str_asciionly_p(VALUE str)
00341 {
00342     rb_encoding *enc = STR_ENC_GET(str);
00343 
00344     if (!rb_enc_asciicompat(enc))
00345         return FALSE;
00346     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00347         return TRUE;
00348     return FALSE;
00349 }
00350 
00351 static inline void
00352 str_mod_check(VALUE s, const char *p, long len)
00353 {
00354     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00355         rb_raise(rb_eRuntimeError, "string modified");
00356     }
00357 }
00358 
00359 size_t
00360 rb_str_capacity(VALUE str)
00361 {
00362     if (STR_EMBED_P(str)) {
00363         return RSTRING_EMBED_LEN_MAX;
00364     }
00365     else if (STR_NOCAPA_P(str)) {
00366         return RSTRING(str)->as.heap.len;
00367     }
00368     else {
00369         return RSTRING(str)->as.heap.aux.capa;
00370     }
00371 }
00372 
00373 static inline VALUE
00374 str_alloc(VALUE klass)
00375 {
00376     NEWOBJ_OF(str, struct RString, klass, T_STRING);
00377 
00378     str->as.heap.ptr = 0;
00379     str->as.heap.len = 0;
00380     str->as.heap.aux.capa = 0;
00381 
00382     return (VALUE)str;
00383 }
00384 
00385 static inline VALUE
00386 empty_str_alloc(VALUE klass)
00387 {
00388     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00389         RUBY_DTRACE_STRING_CREATE(0, rb_sourcefile(), rb_sourceline());
00390     }
00391     return str_alloc(klass);
00392 }
00393 
00394 static VALUE
00395 str_new(VALUE klass, const char *ptr, long len)
00396 {
00397     VALUE str;
00398 
00399     if (len < 0) {
00400         rb_raise(rb_eArgError, "negative string size (or size too big)");
00401     }
00402 
00403     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00404         RUBY_DTRACE_STRING_CREATE(len, rb_sourcefile(), rb_sourceline());
00405     }
00406 
00407     str = str_alloc(klass);
00408     if (len > RSTRING_EMBED_LEN_MAX) {
00409         RSTRING(str)->as.heap.aux.capa = len;
00410         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00411         STR_SET_NOEMBED(str);
00412     }
00413     else if (len == 0) {
00414         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00415     }
00416     if (ptr) {
00417         memcpy(RSTRING_PTR(str), ptr, len);
00418     }
00419     STR_SET_LEN(str, len);
00420     RSTRING_PTR(str)[len] = '\0';
00421     return str;
00422 }
00423 
00424 VALUE
00425 rb_str_new(const char *ptr, long len)
00426 {
00427     return str_new(rb_cString, ptr, len);
00428 }
00429 
00430 VALUE
00431 rb_usascii_str_new(const char *ptr, long len)
00432 {
00433     VALUE str = rb_str_new(ptr, len);
00434     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00435     return str;
00436 }
00437 
00438 VALUE
00439 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00440 {
00441     VALUE str = rb_str_new(ptr, len);
00442     rb_enc_associate(str, enc);
00443     return str;
00444 }
00445 
00446 VALUE
00447 rb_str_new_cstr(const char *ptr)
00448 {
00449     if (!ptr) {
00450         rb_raise(rb_eArgError, "NULL pointer given");
00451     }
00452     return rb_str_new(ptr, strlen(ptr));
00453 }
00454 
00455 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00456 #define rb_str_new2 rb_str_new_cstr
00457 
00458 VALUE
00459 rb_usascii_str_new_cstr(const char *ptr)
00460 {
00461     VALUE str = rb_str_new2(ptr);
00462     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00463     return str;
00464 }
00465 
00466 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00467 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00468 
00469 VALUE
00470 rb_tainted_str_new(const char *ptr, long len)
00471 {
00472     VALUE str = rb_str_new(ptr, len);
00473 
00474     OBJ_TAINT(str);
00475     return str;
00476 }
00477 
00478 VALUE
00479 rb_tainted_str_new_cstr(const char *ptr)
00480 {
00481     VALUE str = rb_str_new2(ptr);
00482 
00483     OBJ_TAINT(str);
00484     return str;
00485 }
00486 
00487 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00488 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00489 
00490 VALUE
00491 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00492 {
00493     extern VALUE rb_cEncodingConverter;
00494     rb_econv_t *ec;
00495     rb_econv_result_t ret;
00496     long len, olen;
00497     VALUE econv_wrapper;
00498     VALUE newstr;
00499     const unsigned char *start, *sp;
00500     unsigned char *dest, *dp;
00501     size_t converted_output = 0;
00502 
00503     if (!to) return str;
00504     if (!from) from = rb_enc_get(str);
00505     if (from == to) return str;
00506     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00507         to == rb_ascii8bit_encoding()) {
00508         if (STR_ENC_GET(str) != to) {
00509             str = rb_str_dup(str);
00510             rb_enc_associate(str, to);
00511         }
00512         return str;
00513     }
00514 
00515     len = RSTRING_LEN(str);
00516     newstr = rb_str_new(0, len);
00517     olen = len;
00518 
00519     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
00520     RBASIC(econv_wrapper)->klass = 0;
00521     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00522     if (!ec) return str;
00523     DATA_PTR(econv_wrapper) = ec;
00524 
00525     sp = (unsigned char*)RSTRING_PTR(str);
00526     start = sp;
00527     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
00528            (dp = dest + converted_output),
00529            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
00530            ret == econv_destination_buffer_full) {
00531         /* destination buffer short */
00532         size_t converted_input = sp - start;
00533         size_t rest = len - converted_input;
00534         converted_output = dp - dest;
00535         rb_str_set_len(newstr, converted_output);
00536         if (converted_input && converted_output &&
00537             rest < (LONG_MAX / converted_output)) {
00538             rest = (rest * converted_output) / converted_input;
00539         }
00540         else {
00541             rest = olen;
00542         }
00543         olen += rest < 2 ? 2 : rest;
00544         rb_str_resize(newstr, olen);
00545     }
00546     DATA_PTR(econv_wrapper) = 0;
00547     rb_econv_close(ec);
00548     rb_gc_force_recycle(econv_wrapper);
00549     switch (ret) {
00550       case econv_finished:
00551         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00552         rb_str_set_len(newstr, len);
00553         rb_enc_associate(newstr, to);
00554         return newstr;
00555 
00556       default:
00557         /* some error, return original */
00558         return str;
00559     }
00560 }
00561 
00562 VALUE
00563 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00564 {
00565     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00566 }
00567 
00568 VALUE
00569 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00570 {
00571     VALUE str;
00572 
00573     str = rb_tainted_str_new(ptr, len);
00574     if (eenc == rb_usascii_encoding() &&
00575         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00576         rb_enc_associate(str, rb_ascii8bit_encoding());
00577         return str;
00578     }
00579     rb_enc_associate(str, eenc);
00580     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00581 }
00582 
00583 VALUE
00584 rb_external_str_new(const char *ptr, long len)
00585 {
00586     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00587 }
00588 
00589 VALUE
00590 rb_external_str_new_cstr(const char *ptr)
00591 {
00592     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00593 }
00594 
00595 VALUE
00596 rb_locale_str_new(const char *ptr, long len)
00597 {
00598     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00599 }
00600 
00601 VALUE
00602 rb_locale_str_new_cstr(const char *ptr)
00603 {
00604     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00605 }
00606 
00607 VALUE
00608 rb_filesystem_str_new(const char *ptr, long len)
00609 {
00610     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00611 }
00612 
00613 VALUE
00614 rb_filesystem_str_new_cstr(const char *ptr)
00615 {
00616     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00617 }
00618 
00619 VALUE
00620 rb_str_export(VALUE str)
00621 {
00622     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00623 }
00624 
00625 VALUE
00626 rb_str_export_locale(VALUE str)
00627 {
00628     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00629 }
00630 
00631 VALUE
00632 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00633 {
00634     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00635 }
00636 
00637 static VALUE
00638 str_replace_shared_without_enc(VALUE str2, VALUE str)
00639 {
00640     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00641         STR_SET_EMBED(str2);
00642         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00643         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00644     }
00645     else {
00646         str = rb_str_new_frozen(str);
00647         FL_SET(str2, STR_NOEMBED);
00648         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00649         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00650         RSTRING(str2)->as.heap.aux.shared = str;
00651         FL_SET(str2, ELTS_SHARED);
00652     }
00653     return str2;
00654 }
00655 
00656 static VALUE
00657 str_replace_shared(VALUE str2, VALUE str)
00658 {
00659     str_replace_shared_without_enc(str2, str);
00660     rb_enc_cr_str_exact_copy(str2, str);
00661     return str2;
00662 }
00663 
00664 static VALUE
00665 str_new_shared(VALUE klass, VALUE str)
00666 {
00667     return str_replace_shared(str_alloc(klass), str);
00668 }
00669 
00670 static VALUE
00671 str_new3(VALUE klass, VALUE str)
00672 {
00673     return str_new_shared(klass, str);
00674 }
00675 
00676 VALUE
00677 rb_str_new_shared(VALUE str)
00678 {
00679     VALUE str2 = str_new3(rb_obj_class(str), str);
00680 
00681     OBJ_INFECT(str2, str);
00682     return str2;
00683 }
00684 
00685 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00686 #define rb_str_new3 rb_str_new_shared
00687 
00688 static VALUE
00689 str_new4(VALUE klass, VALUE str)
00690 {
00691     VALUE str2;
00692 
00693     str2 = str_alloc(klass);
00694     STR_SET_NOEMBED(str2);
00695     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00696     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00697     if (STR_SHARED_P(str)) {
00698         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00699         assert(OBJ_FROZEN(shared));
00700         FL_SET(str2, ELTS_SHARED);
00701         RSTRING(str2)->as.heap.aux.shared = shared;
00702     }
00703     else {
00704         FL_SET(str, ELTS_SHARED);
00705         RSTRING(str)->as.heap.aux.shared = str2;
00706     }
00707     rb_enc_cr_str_exact_copy(str2, str);
00708     OBJ_INFECT(str2, str);
00709     return str2;
00710 }
00711 
00712 VALUE
00713 rb_str_new_frozen(VALUE orig)
00714 {
00715     VALUE klass, str;
00716 
00717     if (OBJ_FROZEN(orig)) return orig;
00718     klass = rb_obj_class(orig);
00719     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00720         long ofs;
00721         assert(OBJ_FROZEN(str));
00722         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00723         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00724             ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) ||
00725             ENCODING_GET(str) != ENCODING_GET(orig)) {
00726             str = str_new3(klass, str);
00727             RSTRING(str)->as.heap.ptr += ofs;
00728             RSTRING(str)->as.heap.len -= ofs;
00729             rb_enc_cr_str_exact_copy(str, orig);
00730             OBJ_INFECT(str, orig);
00731         }
00732     }
00733     else if (STR_EMBED_P(orig)) {
00734         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00735         rb_enc_cr_str_exact_copy(str, orig);
00736         OBJ_INFECT(str, orig);
00737     }
00738     else if (STR_ASSOC_P(orig)) {
00739         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00740         FL_UNSET(orig, STR_ASSOC);
00741         str = str_new4(klass, orig);
00742         FL_SET(str, STR_ASSOC);
00743         RSTRING(str)->as.heap.aux.shared = assoc;
00744     }
00745     else {
00746         str = str_new4(klass, orig);
00747     }
00748     OBJ_FREEZE(str);
00749     return str;
00750 }
00751 
00752 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00753 #define rb_str_new4 rb_str_new_frozen
00754 
00755 VALUE
00756 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00757 {
00758     return str_new(rb_obj_class(obj), ptr, len);
00759 }
00760 
00761 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00762            rb_str_new_with_class, (obj, ptr, len))
00763 #define rb_str_new5 rb_str_new_with_class
00764 
00765 static VALUE
00766 str_new_empty(VALUE str)
00767 {
00768     VALUE v = rb_str_new5(str, 0, 0);
00769     rb_enc_copy(v, str);
00770     OBJ_INFECT(v, str);
00771     return v;
00772 }
00773 
00774 #define STR_BUF_MIN_SIZE 128
00775 
00776 VALUE
00777 rb_str_buf_new(long capa)
00778 {
00779     VALUE str = str_alloc(rb_cString);
00780 
00781     if (capa < STR_BUF_MIN_SIZE) {
00782         capa = STR_BUF_MIN_SIZE;
00783     }
00784     FL_SET(str, STR_NOEMBED);
00785     RSTRING(str)->as.heap.aux.capa = capa;
00786     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00787     RSTRING(str)->as.heap.ptr[0] = '\0';
00788 
00789     return str;
00790 }
00791 
00792 VALUE
00793 rb_str_buf_new_cstr(const char *ptr)
00794 {
00795     VALUE str;
00796     long len = strlen(ptr);
00797 
00798     str = rb_str_buf_new(len);
00799     rb_str_buf_cat(str, ptr, len);
00800 
00801     return str;
00802 }
00803 
00804 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00805 #define rb_str_buf_new2 rb_str_buf_new_cstr
00806 
00807 VALUE
00808 rb_str_tmp_new(long len)
00809 {
00810     return str_new(0, 0, len);
00811 }
00812 
00813 void *
00814 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00815 {
00816     VALUE s = rb_str_tmp_new(len);
00817     *store = s;
00818     return RSTRING_PTR(s);
00819 }
00820 
00821 void
00822 rb_free_tmp_buffer(volatile VALUE *store)
00823 {
00824     VALUE s = *store;
00825     *store = 0;
00826     if (s) rb_str_clear(s);
00827 }
00828 
00829 void
00830 rb_str_free(VALUE str)
00831 {
00832     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00833         xfree(RSTRING(str)->as.heap.ptr);
00834     }
00835 }
00836 
00837 RUBY_FUNC_EXPORTED size_t
00838 rb_str_memsize(VALUE str)
00839 {
00840     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00841         return RSTRING(str)->as.heap.aux.capa;
00842     }
00843     else {
00844         return 0;
00845     }
00846 }
00847 
00848 VALUE
00849 rb_str_to_str(VALUE str)
00850 {
00851     return rb_convert_type(str, T_STRING, "String", "to_str");
00852 }
00853 
00854 static inline void str_discard(VALUE str);
00855 
00856 void
00857 rb_str_shared_replace(VALUE str, VALUE str2)
00858 {
00859     rb_encoding *enc;
00860     int cr;
00861     if (str == str2) return;
00862     enc = STR_ENC_GET(str2);
00863     cr = ENC_CODERANGE(str2);
00864     str_discard(str);
00865     OBJ_INFECT(str, str2);
00866     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00867         STR_SET_EMBED(str);
00868         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00869         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00870         rb_enc_associate(str, enc);
00871         ENC_CODERANGE_SET(str, cr);
00872         return;
00873     }
00874     STR_SET_NOEMBED(str);
00875     STR_UNSET_NOCAPA(str);
00876     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00877     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00878     if (STR_NOCAPA_P(str2)) {
00879         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00880         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00881     }
00882     else {
00883         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00884     }
00885     STR_SET_EMBED(str2);        /* abandon str2 */
00886     RSTRING_PTR(str2)[0] = 0;
00887     STR_SET_EMBED_LEN(str2, 0);
00888     rb_enc_associate(str, enc);
00889     ENC_CODERANGE_SET(str, cr);
00890 }
00891 
00892 static ID id_to_s;
00893 
00894 VALUE
00895 rb_obj_as_string(VALUE obj)
00896 {
00897     VALUE str;
00898 
00899     if (RB_TYPE_P(obj, T_STRING)) {
00900         return obj;
00901     }
00902     str = rb_funcall(obj, id_to_s, 0);
00903     if (!RB_TYPE_P(str, T_STRING))
00904         return rb_any_to_s(obj);
00905     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00906     return str;
00907 }
00908 
00909 static VALUE
00910 str_replace(VALUE str, VALUE str2)
00911 {
00912     long len;
00913 
00914     len = RSTRING_LEN(str2);
00915     if (STR_ASSOC_P(str2)) {
00916         str2 = rb_str_new4(str2);
00917     }
00918     if (STR_SHARED_P(str2)) {
00919         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00920         assert(OBJ_FROZEN(shared));
00921         STR_SET_NOEMBED(str);
00922         RSTRING(str)->as.heap.len = len;
00923         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00924         FL_SET(str, ELTS_SHARED);
00925         FL_UNSET(str, STR_ASSOC);
00926         RSTRING(str)->as.heap.aux.shared = shared;
00927     }
00928     else {
00929         str_replace_shared(str, str2);
00930     }
00931 
00932     OBJ_INFECT(str, str2);
00933     rb_enc_cr_str_exact_copy(str, str2);
00934     return str;
00935 }
00936 
00937 static VALUE
00938 str_duplicate(VALUE klass, VALUE str)
00939 {
00940     VALUE dup = str_alloc(klass);
00941     str_replace(dup, str);
00942     return dup;
00943 }
00944 
00945 VALUE
00946 rb_str_dup(VALUE str)
00947 {
00948     return str_duplicate(rb_obj_class(str), str);
00949 }
00950 
00951 VALUE
00952 rb_str_resurrect(VALUE str)
00953 {
00954     if (RUBY_DTRACE_STRING_CREATE_ENABLED()) {
00955         RUBY_DTRACE_STRING_CREATE(RSTRING_LEN(str),
00956                                   rb_sourcefile(), rb_sourceline());
00957     }
00958     return str_replace(str_alloc(rb_cString), str);
00959 }
00960 
00961 /*
00962  *  call-seq:
00963  *     String.new(str="")   -> new_str
00964  *
00965  *  Returns a new string object containing a copy of <i>str</i>.
00966  */
00967 
00968 static VALUE
00969 rb_str_init(int argc, VALUE *argv, VALUE str)
00970 {
00971     VALUE orig;
00972 
00973     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00974         rb_str_replace(str, orig);
00975     return str;
00976 }
00977 
00978 static inline long
00979 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00980 {
00981     long c;
00982     const char *q;
00983 
00984     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00985         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00986     }
00987     else if (rb_enc_asciicompat(enc)) {
00988         c = 0;
00989         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00990             while (p < e) {
00991                 if (ISASCII(*p)) {
00992                     q = search_nonascii(p, e);
00993                     if (!q)
00994                         return c + (e - p);
00995                     c += q - p;
00996                     p = q;
00997                 }
00998                 p += rb_enc_fast_mbclen(p, e, enc);
00999                 c++;
01000             }
01001         }
01002         else {
01003             while (p < e) {
01004                 if (ISASCII(*p)) {
01005                     q = search_nonascii(p, e);
01006                     if (!q)
01007                         return c + (e - p);
01008                     c += q - p;
01009                     p = q;
01010                 }
01011                 p += rb_enc_mbclen(p, e, enc);
01012                 c++;
01013             }
01014         }
01015         return c;
01016     }
01017 
01018     for (c=0; p<e; c++) {
01019         p += rb_enc_mbclen(p, e, enc);
01020     }
01021     return c;
01022 }
01023 
01024 long
01025 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
01026 {
01027     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
01028 }
01029 
01030 long
01031 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
01032 {
01033     long c;
01034     const char *q;
01035     int ret;
01036 
01037     *cr = 0;
01038     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01039         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
01040     }
01041     else if (rb_enc_asciicompat(enc)) {
01042         c = 0;
01043         while (p < e) {
01044             if (ISASCII(*p)) {
01045                 q = search_nonascii(p, e);
01046                 if (!q) {
01047                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01048                     return c + (e - p);
01049                 }
01050                 c += q - p;
01051                 p = q;
01052             }
01053             ret = rb_enc_precise_mbclen(p, e, enc);
01054             if (MBCLEN_CHARFOUND_P(ret)) {
01055                 *cr |= ENC_CODERANGE_VALID;
01056                 p += MBCLEN_CHARFOUND_LEN(ret);
01057             }
01058             else {
01059                 *cr = ENC_CODERANGE_BROKEN;
01060                 p++;
01061             }
01062             c++;
01063         }
01064         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01065         return c;
01066     }
01067 
01068     for (c=0; p<e; c++) {
01069         ret = rb_enc_precise_mbclen(p, e, enc);
01070         if (MBCLEN_CHARFOUND_P(ret)) {
01071             *cr |= ENC_CODERANGE_VALID;
01072             p += MBCLEN_CHARFOUND_LEN(ret);
01073         }
01074         else {
01075             *cr = ENC_CODERANGE_BROKEN;
01076             if (p + rb_enc_mbminlen(enc) <= e)
01077                 p += rb_enc_mbminlen(enc);
01078             else
01079                 p = e;
01080         }
01081     }
01082     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01083     return c;
01084 }
01085 
01086 #ifdef NONASCII_MASK
01087 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
01088 
01089 /*
01090  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
01091  * bit represention. (see http://en.wikipedia.org/wiki/UTF-8)
01092  * Therefore, following pseudo code can detect UTF-8 leading byte.
01093  *
01094  * if (!(byte & 0x80))
01095  *   byte |= 0x40;          // turn on bit6
01096  * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
01097  *
01098  * This function calculate every bytes in the argument word `s'
01099  * using the above logic concurrently. and gather every bytes result.
01100  */
01101 static inline VALUE
01102 count_utf8_lead_bytes_with_word(const VALUE *s)
01103 {
01104     VALUE d = *s;
01105 
01106     /* Transform into bit0 represent UTF-8 leading or not. */
01107     d |= ~(d>>1);
01108     d >>= 6;
01109     d &= NONASCII_MASK >> 7;
01110 
01111     /* Gather every bytes. */
01112     d += (d>>8);
01113     d += (d>>16);
01114 #if SIZEOF_VALUE == 8
01115     d += (d>>32);
01116 #endif
01117     return (d&0xF);
01118 }
01119 #endif
01120 
01121 static long
01122 str_strlen(VALUE str, rb_encoding *enc)
01123 {
01124     const char *p, *e;
01125     long n;
01126     int cr;
01127 
01128     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01129     if (!enc) enc = STR_ENC_GET(str);
01130     p = RSTRING_PTR(str);
01131     e = RSTRING_END(str);
01132     cr = ENC_CODERANGE(str);
01133 #ifdef NONASCII_MASK
01134     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01135         enc == rb_utf8_encoding()) {
01136 
01137         VALUE len = 0;
01138         if ((int)sizeof(VALUE) * 2 < e - p) {
01139             const VALUE *s, *t;
01140             const VALUE lowbits = sizeof(VALUE) - 1;
01141             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01142             t = (const VALUE*)(~lowbits & (VALUE)e);
01143             while (p < (const char *)s) {
01144                 if (is_utf8_lead_byte(*p)) len++;
01145                 p++;
01146             }
01147             while (s < t) {
01148                 len += count_utf8_lead_bytes_with_word(s);
01149                 s++;
01150             }
01151             p = (const char *)s;
01152         }
01153         while (p < e) {
01154             if (is_utf8_lead_byte(*p)) len++;
01155             p++;
01156         }
01157         return (long)len;
01158     }
01159 #endif
01160     n = rb_enc_strlen_cr(p, e, enc, &cr);
01161     if (cr) {
01162         ENC_CODERANGE_SET(str, cr);
01163     }
01164     return n;
01165 }
01166 
01167 long
01168 rb_str_strlen(VALUE str)
01169 {
01170     return str_strlen(str, STR_ENC_GET(str));
01171 }
01172 
01173 /*
01174  *  call-seq:
01175  *     str.length   -> integer
01176  *     str.size     -> integer
01177  *
01178  *  Returns the character length of <i>str</i>.
01179  */
01180 
01181 VALUE
01182 rb_str_length(VALUE str)
01183 {
01184     long len;
01185 
01186     len = str_strlen(str, STR_ENC_GET(str));
01187     return LONG2NUM(len);
01188 }
01189 
01190 /*
01191  *  call-seq:
01192  *     str.bytesize  -> integer
01193  *
01194  *  Returns the length of +str+ in bytes.
01195  *
01196  *    "\x80\u3042".bytesize  #=> 4
01197  *    "hello".bytesize       #=> 5
01198  */
01199 
01200 static VALUE
01201 rb_str_bytesize(VALUE str)
01202 {
01203     return LONG2NUM(RSTRING_LEN(str));
01204 }
01205 
01206 /*
01207  *  call-seq:
01208  *     str.empty?   -> true or false
01209  *
01210  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01211  *
01212  *     "hello".empty?   #=> false
01213  *     " ".empty?       #=> false
01214  *     "".empty?        #=> true
01215  */
01216 
01217 static VALUE
01218 rb_str_empty(VALUE str)
01219 {
01220     if (RSTRING_LEN(str) == 0)
01221         return Qtrue;
01222     return Qfalse;
01223 }
01224 
01225 /*
01226  *  call-seq:
01227  *     str + other_str   -> new_str
01228  *
01229  *  Concatenation---Returns a new <code>String</code> containing
01230  *  <i>other_str</i> concatenated to <i>str</i>.
01231  *
01232  *     "Hello from " + self.to_s   #=> "Hello from main"
01233  */
01234 
01235 VALUE
01236 rb_str_plus(VALUE str1, VALUE str2)
01237 {
01238     VALUE str3;
01239     rb_encoding *enc;
01240 
01241     StringValue(str2);
01242     enc = rb_enc_check(str1, str2);
01243     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01244     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01245     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01246            RSTRING_PTR(str2), RSTRING_LEN(str2));
01247     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01248 
01249     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01250         OBJ_TAINT(str3);
01251     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01252                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01253     return str3;
01254 }
01255 
01256 /*
01257  *  call-seq:
01258  *     str * integer   -> new_str
01259  *
01260  *  Copy --- Returns a new String containing +integer+ copies of the receiver.
01261  *  +integer+ must be greater than or equal to 0.
01262  *
01263  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01264  *     "Ho! " * 0   #=> ""
01265  */
01266 
01267 VALUE
01268 rb_str_times(VALUE str, VALUE times)
01269 {
01270     VALUE str2;
01271     long n, len;
01272     char *ptr2;
01273 
01274     len = NUM2LONG(times);
01275     if (len < 0) {
01276         rb_raise(rb_eArgError, "negative argument");
01277     }
01278     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01279         rb_raise(rb_eArgError, "argument too big");
01280     }
01281 
01282     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01283     ptr2 = RSTRING_PTR(str2);
01284     if (len) {
01285         n = RSTRING_LEN(str);
01286         memcpy(ptr2, RSTRING_PTR(str), n);
01287         while (n <= len/2) {
01288             memcpy(ptr2 + n, ptr2, n);
01289             n *= 2;
01290         }
01291         memcpy(ptr2 + n, ptr2, len-n);
01292     }
01293     ptr2[RSTRING_LEN(str2)] = '\0';
01294     OBJ_INFECT(str2, str);
01295     rb_enc_cr_str_copy_for_substr(str2, str);
01296 
01297     return str2;
01298 }
01299 
01300 /*
01301  *  call-seq:
01302  *     str % arg   -> new_str
01303  *
01304  *  Format---Uses <i>str</i> as a format specification, and returns the result
01305  *  of applying it to <i>arg</i>. If the format specification contains more than
01306  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01307  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01308  *  details of the format string.
01309  *
01310  *     "%05d" % 123                              #=> "00123"
01311  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01312  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01313  */
01314 
01315 static VALUE
01316 rb_str_format_m(VALUE str, VALUE arg)
01317 {
01318     volatile VALUE tmp = rb_check_array_type(arg);
01319 
01320     if (!NIL_P(tmp)) {
01321         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01322     }
01323     return rb_str_format(1, &arg, str);
01324 }
01325 
01326 static inline void
01327 str_modifiable(VALUE str)
01328 {
01329     if (FL_TEST(str, STR_TMPLOCK)) {
01330         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01331     }
01332     rb_check_frozen(str);
01333     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01334         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01335 }
01336 
01337 static inline int
01338 str_independent(VALUE str)
01339 {
01340     str_modifiable(str);
01341     if (!STR_SHARED_P(str)) return 1;
01342     if (STR_EMBED_P(str)) return 1;
01343     return 0;
01344 }
01345 
01346 static void
01347 str_make_independent_expand(VALUE str, long expand)
01348 {
01349     char *ptr;
01350     long len = RSTRING_LEN(str);
01351     long capa = len + expand;
01352 
01353     if (len > capa) len = capa;
01354     ptr = ALLOC_N(char, capa + 1);
01355     if (RSTRING_PTR(str)) {
01356         memcpy(ptr, RSTRING_PTR(str), len);
01357     }
01358     STR_SET_NOEMBED(str);
01359     STR_UNSET_NOCAPA(str);
01360     ptr[len] = 0;
01361     RSTRING(str)->as.heap.ptr = ptr;
01362     RSTRING(str)->as.heap.len = len;
01363     RSTRING(str)->as.heap.aux.capa = capa;
01364 }
01365 
01366 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01367 
01368 void
01369 rb_str_modify(VALUE str)
01370 {
01371     if (!str_independent(str))
01372         str_make_independent(str);
01373     ENC_CODERANGE_CLEAR(str);
01374 }
01375 
01376 void
01377 rb_str_modify_expand(VALUE str, long expand)
01378 {
01379     if (expand < 0) {
01380         rb_raise(rb_eArgError, "negative expanding string size");
01381     }
01382     if (!str_independent(str)) {
01383         str_make_independent_expand(str, expand);
01384     }
01385     else if (expand > 0) {
01386         long len = RSTRING_LEN(str);
01387         long capa = len + expand;
01388         if (!STR_EMBED_P(str)) {
01389             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01390             STR_UNSET_NOCAPA(str);
01391             RSTRING(str)->as.heap.aux.capa = capa;
01392         }
01393         else if (capa > RSTRING_EMBED_LEN_MAX) {
01394             str_make_independent_expand(str, expand);
01395         }
01396     }
01397     ENC_CODERANGE_CLEAR(str);
01398 }
01399 
01400 /* As rb_str_modify(), but don't clear coderange */
01401 static void
01402 str_modify_keep_cr(VALUE str)
01403 {
01404     if (!str_independent(str))
01405         str_make_independent(str);
01406     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01407         /* Force re-scan later */
01408         ENC_CODERANGE_CLEAR(str);
01409 }
01410 
01411 static inline void
01412 str_discard(VALUE str)
01413 {
01414     str_modifiable(str);
01415     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01416         xfree(RSTRING_PTR(str));
01417         RSTRING(str)->as.heap.ptr = 0;
01418         RSTRING(str)->as.heap.len = 0;
01419     }
01420 }
01421 
01422 void
01423 rb_str_associate(VALUE str, VALUE add)
01424 {
01425     /* sanity check */
01426     rb_check_frozen(str);
01427     if (STR_ASSOC_P(str)) {
01428         /* already associated */
01429         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01430     }
01431     else {
01432         if (STR_SHARED_P(str)) {
01433             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01434             str_make_independent(str);
01435             if (STR_ASSOC_P(assoc)) {
01436                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01437                 rb_ary_concat(assoc, add);
01438                 add = assoc;
01439             }
01440         }
01441         else if (STR_EMBED_P(str)) {
01442             str_make_independent(str);
01443         }
01444         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01445             RESIZE_CAPA(str, RSTRING_LEN(str));
01446         }
01447         FL_SET(str, STR_ASSOC);
01448         RBASIC(add)->klass = 0;
01449         RSTRING(str)->as.heap.aux.shared = add;
01450     }
01451 }
01452 
01453 VALUE
01454 rb_str_associated(VALUE str)
01455 {
01456     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01457     if (STR_ASSOC_P(str)) {
01458         return RSTRING(str)->as.heap.aux.shared;
01459     }
01460     return Qfalse;
01461 }
01462 
01463 void
01464 rb_must_asciicompat(VALUE str)
01465 {
01466     rb_encoding *enc = rb_enc_get(str);
01467     if (!rb_enc_asciicompat(enc)) {
01468         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
01469     }
01470 }
01471 
01472 VALUE
01473 rb_string_value(volatile VALUE *ptr)
01474 {
01475     VALUE s = *ptr;
01476     if (!RB_TYPE_P(s, T_STRING)) {
01477         s = rb_str_to_str(s);
01478         *ptr = s;
01479     }
01480     return s;
01481 }
01482 
01483 char *
01484 rb_string_value_ptr(volatile VALUE *ptr)
01485 {
01486     VALUE str = rb_string_value(ptr);
01487     return RSTRING_PTR(str);
01488 }
01489 
01490 char *
01491 rb_string_value_cstr(volatile VALUE *ptr)
01492 {
01493     VALUE str = rb_string_value(ptr);
01494     char *s = RSTRING_PTR(str);
01495     long len = RSTRING_LEN(str);
01496 
01497     if (!s || memchr(s, 0, len)) {
01498         rb_raise(rb_eArgError, "string contains null byte");
01499     }
01500     if (s[len]) {
01501         rb_str_modify(str);
01502         s = RSTRING_PTR(str);
01503         s[RSTRING_LEN(str)] = 0;
01504     }
01505     return s;
01506 }
01507 
01508 VALUE
01509 rb_check_string_type(VALUE str)
01510 {
01511     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01512     return str;
01513 }
01514 
01515 /*
01516  *  call-seq:
01517  *     String.try_convert(obj) -> string or nil
01518  *
01519  *  Try to convert <i>obj</i> into a String, using to_str method.
01520  *  Returns converted string or nil if <i>obj</i> cannot be converted
01521  *  for any reason.
01522  *
01523  *     String.try_convert("str")     #=> "str"
01524  *     String.try_convert(/re/)      #=> nil
01525  */
01526 static VALUE
01527 rb_str_s_try_convert(VALUE dummy, VALUE str)
01528 {
01529     return rb_check_string_type(str);
01530 }
01531 
01532 static char*
01533 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01534 {
01535     long nth = *nthp;
01536     if (rb_enc_mbmaxlen(enc) == 1) {
01537         p += nth;
01538     }
01539     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01540         p += nth * rb_enc_mbmaxlen(enc);
01541     }
01542     else if (rb_enc_asciicompat(enc)) {
01543         const char *p2, *e2;
01544         int n;
01545 
01546         while (p < e && 0 < nth) {
01547             e2 = p + nth;
01548             if (e < e2) {
01549                 *nthp = nth;
01550                 return (char *)e;
01551             }
01552             if (ISASCII(*p)) {
01553                 p2 = search_nonascii(p, e2);
01554                 if (!p2) {
01555                     nth -= e2 - p;
01556                     *nthp = nth;
01557                     return (char *)e2;
01558                 }
01559                 nth -= p2 - p;
01560                 p = p2;
01561             }
01562             n = rb_enc_mbclen(p, e, enc);
01563             p += n;
01564             nth--;
01565         }
01566         *nthp = nth;
01567         if (nth != 0) {
01568             return (char *)e;
01569         }
01570         return (char *)p;
01571     }
01572     else {
01573         while (p < e && nth--) {
01574             p += rb_enc_mbclen(p, e, enc);
01575         }
01576     }
01577     if (p > e) p = e;
01578     *nthp = nth;
01579     return (char*)p;
01580 }
01581 
01582 char*
01583 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01584 {
01585     return str_nth_len(p, e, &nth, enc);
01586 }
01587 
01588 static char*
01589 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01590 {
01591     if (singlebyte)
01592         p += nth;
01593     else {
01594         p = str_nth_len(p, e, &nth, enc);
01595     }
01596     if (!p) return 0;
01597     if (p > e) p = e;
01598     return (char *)p;
01599 }
01600 
01601 /* char offset to byte offset */
01602 static long
01603 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01604 {
01605     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01606     if (!pp) return e - p;
01607     return pp - p;
01608 }
01609 
01610 long
01611 rb_str_offset(VALUE str, long pos)
01612 {
01613     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01614                       STR_ENC_GET(str), single_byte_optimizable(str));
01615 }
01616 
01617 #ifdef NONASCII_MASK
01618 static char *
01619 str_utf8_nth(const char *p, const char *e, long *nthp)
01620 {
01621     long nth = *nthp;
01622     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01623         const VALUE *s, *t;
01624         const VALUE lowbits = sizeof(VALUE) - 1;
01625         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01626         t = (const VALUE*)(~lowbits & (VALUE)e);
01627         while (p < (const char *)s) {
01628             if (is_utf8_lead_byte(*p)) nth--;
01629             p++;
01630         }
01631         do {
01632             nth -= count_utf8_lead_bytes_with_word(s);
01633             s++;
01634         } while (s < t && (int)sizeof(VALUE) <= nth);
01635         p = (char *)s;
01636     }
01637     while (p < e) {
01638         if (is_utf8_lead_byte(*p)) {
01639             if (nth == 0) break;
01640             nth--;
01641         }
01642         p++;
01643     }
01644     *nthp = nth;
01645     return (char *)p;
01646 }
01647 
/* Byte offset of the `nth` character in [p, e), via str_utf8_nth(). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
01654 #endif
01655 
01656 /* byte offset to char offset */
01657 long
01658 rb_str_sublen(VALUE str, long pos)
01659 {
01660     if (single_byte_optimizable(str) || pos < 0)
01661         return pos;
01662     else {
01663         char *p = RSTRING_PTR(str);
01664         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01665     }
01666 }
01667 
01668 VALUE
01669 rb_str_subseq(VALUE str, long beg, long len)
01670 {
01671     VALUE str2;
01672 
01673     if (RSTRING_LEN(str) == beg + len &&
01674         RSTRING_EMBED_LEN_MAX < len) {
01675         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01676         rb_str_drop_bytes(str2, beg);
01677     }
01678     else {
01679         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01680         RB_GC_GUARD(str);
01681     }
01682 
01683     rb_enc_cr_str_copy_for_substr(str2, str);
01684     OBJ_INFECT(str2, str);
01685 
01686     return str2;
01687 }
01688 
01689 static char *
01690 rb_str_subpos(VALUE str, long beg, long *lenp)
01691 {
01692     long len = *lenp;
01693     long slen = -1L;
01694     long blen = RSTRING_LEN(str);
01695     rb_encoding *enc = STR_ENC_GET(str);
01696     char *p, *s = RSTRING_PTR(str), *e = s + blen;
01697 
01698     if (len < 0) return 0;
01699     if (!blen) {
01700         len = 0;
01701     }
01702     if (single_byte_optimizable(str)) {
01703         if (beg > blen) return 0;
01704         if (beg < 0) {
01705             beg += blen;
01706             if (beg < 0) return 0;
01707         }
01708         if (beg + len > blen)
01709             len = blen - beg;
01710         if (len < 0) return 0;
01711         p = s + beg;
01712         goto end;
01713     }
01714     if (beg < 0) {
01715         if (len > -beg) len = -beg;
01716         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01717             beg = -beg;
01718             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01719             p = e;
01720             if (!p) return 0;
01721             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01722             if (!p) return 0;
01723             len = e - p;
01724             goto end;
01725         }
01726         else {
01727             slen = str_strlen(str, enc);
01728             beg += slen;
01729             if (beg < 0) return 0;
01730             p = s + beg;
01731             if (len == 0) goto end;
01732         }
01733     }
01734     else if (beg > 0 && beg > RSTRING_LEN(str)) {
01735         return 0;
01736     }
01737     if (len == 0) {
01738         if (beg > str_strlen(str, enc)) return 0;
01739         p = s + beg;
01740     }
01741 #ifdef NONASCII_MASK
01742     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01743         enc == rb_utf8_encoding()) {
01744         p = str_utf8_nth(s, e, &beg);
01745         if (beg > 0) return 0;
01746         len = str_utf8_offset(p, e, len);
01747     }
01748 #endif
01749     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01750         int char_sz = rb_enc_mbmaxlen(enc);
01751 
01752         p = s + beg * char_sz;
01753         if (p > e) {
01754             return 0;
01755         }
01756         else if (len * char_sz > e - p)
01757             len = e - p;
01758         else
01759             len *= char_sz;
01760     }
01761     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01762         if (beg > 0) return 0;
01763         len = 0;
01764     }
01765     else {
01766         len = str_offset(p, e, len, enc, 0);
01767     }
01768   end:
01769     *lenp = len;
01770     RB_GC_GUARD(str);
01771     return p;
01772 }
01773 
01774 VALUE
01775 rb_str_substr(VALUE str, long beg, long len)
01776 {
01777     VALUE str2;
01778     char *p = rb_str_subpos(str, beg, &len);
01779 
01780     if (!p) return Qnil;
01781     if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) {
01782         str2 = rb_str_new4(str);
01783         str2 = str_new3(rb_obj_class(str2), str2);
01784         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01785         RSTRING(str2)->as.heap.len = len;
01786     }
01787     else {
01788         str2 = rb_str_new5(str, p, len);
01789         rb_enc_cr_str_copy_for_substr(str2, str);
01790         OBJ_INFECT(str2, str);
01791         RB_GC_GUARD(str);
01792     }
01793 
01794     return str2;
01795 }
01796 
01797 VALUE
01798 rb_str_freeze(VALUE str)
01799 {
01800     if (STR_ASSOC_P(str)) {
01801         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01802         OBJ_FREEZE(ary);
01803     }
01804     return rb_obj_freeze(str);
01805 }
01806 
01807 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01808 #define rb_str_dup_frozen rb_str_new_frozen
01809 
01810 VALUE
01811 rb_str_locktmp(VALUE str)
01812 {
01813     if (FL_TEST(str, STR_TMPLOCK)) {
01814         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01815     }
01816     FL_SET(str, STR_TMPLOCK);
01817     return str;
01818 }
01819 
01820 VALUE
01821 rb_str_unlocktmp(VALUE str)
01822 {
01823     if (!FL_TEST(str, STR_TMPLOCK)) {
01824         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01825     }
01826     FL_UNSET(str, STR_TMPLOCK);
01827     return str;
01828 }
01829 
01830 VALUE
01831 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
01832 {
01833     rb_str_locktmp(str);
01834     return rb_ensure(func, arg, rb_str_unlocktmp, str);
01835 }
01836 
01837 void
01838 rb_str_set_len(VALUE str, long len)
01839 {
01840     long capa;
01841 
01842     str_modifiable(str);
01843     if (STR_SHARED_P(str)) {
01844         rb_raise(rb_eRuntimeError, "can't set length of shared string");
01845     }
01846     if (len > (capa = (long)rb_str_capacity(str))) {
01847         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01848     }
01849     STR_SET_LEN(str, len);
01850     RSTRING_PTR(str)[len] = '\0';
01851 }
01852 
01853 VALUE
01854 rb_str_resize(VALUE str, long len)
01855 {
01856     long slen;
01857     int independent;
01858 
01859     if (len < 0) {
01860         rb_raise(rb_eArgError, "negative string size (or size too big)");
01861     }
01862 
01863     independent = str_independent(str);
01864     ENC_CODERANGE_CLEAR(str);
01865     slen = RSTRING_LEN(str);
01866     if (len != slen) {
01867         if (STR_EMBED_P(str)) {
01868             if (len <= RSTRING_EMBED_LEN_MAX) {
01869                 STR_SET_EMBED_LEN(str, len);
01870                 RSTRING(str)->as.ary[len] = '\0';
01871                 return str;
01872             }
01873             str_make_independent_expand(str, len - slen);
01874             STR_SET_NOEMBED(str);
01875         }
01876         else if (len <= RSTRING_EMBED_LEN_MAX) {
01877             char *ptr = RSTRING(str)->as.heap.ptr;
01878             STR_SET_EMBED(str);
01879             if (slen > len) slen = len;
01880             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01881             RSTRING(str)->as.ary[len] = '\0';
01882             STR_SET_EMBED_LEN(str, len);
01883             if (independent) xfree(ptr);
01884             return str;
01885         }
01886         else if (!independent) {
01887             str_make_independent_expand(str, len - slen);
01888         }
01889         else if (slen < len || slen - len > 1024) {
01890             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01891         }
01892         if (!STR_NOCAPA_P(str)) {
01893             RSTRING(str)->as.heap.aux.capa = len;
01894         }
01895         RSTRING(str)->as.heap.len = len;
01896         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01897     }
01898     return str;
01899 }
01900 
01901 static VALUE
01902 str_buf_cat(VALUE str, const char *ptr, long len)
01903 {
01904     long capa, total, off = -1;
01905 
01906     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01907         off = ptr - RSTRING_PTR(str);
01908     }
01909     rb_str_modify(str);
01910     if (len == 0) return 0;
01911     if (STR_ASSOC_P(str)) {
01912         FL_UNSET(str, STR_ASSOC);
01913         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01914     }
01915     else if (STR_EMBED_P(str)) {
01916         capa = RSTRING_EMBED_LEN_MAX;
01917     }
01918     else {
01919         capa = RSTRING(str)->as.heap.aux.capa;
01920     }
01921     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01922         rb_raise(rb_eArgError, "string sizes too big");
01923     }
01924     total = RSTRING_LEN(str)+len;
01925     if (capa <= total) {
01926         while (total > capa) {
01927             if (capa + 1 >= LONG_MAX / 2) {
01928                 capa = (total + 4095) / 4096;
01929                 break;
01930             }
01931             capa = (capa + 1) * 2;
01932         }
01933         RESIZE_CAPA(str, capa);
01934     }
01935     if (off != -1) {
01936         ptr = RSTRING_PTR(str) + off;
01937     }
01938     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01939     STR_SET_LEN(str, total);
01940     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01941 
01942     return str;
01943 }
01944 
01945 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01946 
01947 VALUE
01948 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01949 {
01950     if (len == 0) return str;
01951     if (len < 0) {
01952         rb_raise(rb_eArgError, "negative string size (or size too big)");
01953     }
01954     return str_buf_cat(str, ptr, len);
01955 }
01956 
01957 VALUE
01958 rb_str_buf_cat2(VALUE str, const char *ptr)
01959 {
01960     return rb_str_buf_cat(str, ptr, strlen(ptr));
01961 }
01962 
01963 VALUE
01964 rb_str_cat(VALUE str, const char *ptr, long len)
01965 {
01966     if (len < 0) {
01967         rb_raise(rb_eArgError, "negative string size (or size too big)");
01968     }
01969     if (STR_ASSOC_P(str)) {
01970         char *p;
01971         rb_str_modify_expand(str, len);
01972         p = RSTRING(str)->as.heap.ptr;
01973         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01974         len = RSTRING(str)->as.heap.len += len;
01975         p[len] = '\0'; /* sentinel */
01976         return str;
01977     }
01978 
01979     return rb_str_buf_cat(str, ptr, len);
01980 }
01981 
01982 VALUE
01983 rb_str_cat2(VALUE str, const char *ptr)
01984 {
01985     return rb_str_cat(str, ptr, strlen(ptr));
01986 }
01987 
01988 static VALUE
01989 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01990     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01991 {
01992     int str_encindex = ENCODING_GET(str);
01993     int res_encindex;
01994     int str_cr, res_cr;
01995 
01996     str_cr = ENC_CODERANGE(str);
01997 
01998     if (str_encindex == ptr_encindex) {
01999         if (str_cr == ENC_CODERANGE_UNKNOWN)
02000             ptr_cr = ENC_CODERANGE_UNKNOWN;
02001         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02002             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
02003         }
02004     }
02005     else {
02006         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
02007         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
02008         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
02009             if (len == 0)
02010                 return str;
02011             if (RSTRING_LEN(str) == 0) {
02012                 rb_str_buf_cat(str, ptr, len);
02013                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
02014                 return str;
02015             }
02016             goto incompatible;
02017         }
02018         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
02019             ptr_cr = coderange_scan(ptr, len, ptr_enc);
02020         }
02021         if (str_cr == ENC_CODERANGE_UNKNOWN) {
02022             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
02023                 str_cr = rb_enc_str_coderange(str);
02024             }
02025         }
02026     }
02027     if (ptr_cr_ret)
02028         *ptr_cr_ret = ptr_cr;
02029 
02030     if (str_encindex != ptr_encindex &&
02031         str_cr != ENC_CODERANGE_7BIT &&
02032         ptr_cr != ENC_CODERANGE_7BIT) {
02033       incompatible:
02034         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
02035             rb_enc_name(rb_enc_from_index(str_encindex)),
02036             rb_enc_name(rb_enc_from_index(ptr_encindex)));
02037     }
02038 
02039     if (str_cr == ENC_CODERANGE_UNKNOWN) {
02040         res_encindex = str_encindex;
02041         res_cr = ENC_CODERANGE_UNKNOWN;
02042     }
02043     else if (str_cr == ENC_CODERANGE_7BIT) {
02044         if (ptr_cr == ENC_CODERANGE_7BIT) {
02045             res_encindex = str_encindex;
02046             res_cr = ENC_CODERANGE_7BIT;
02047         }
02048         else {
02049             res_encindex = ptr_encindex;
02050             res_cr = ptr_cr;
02051         }
02052     }
02053     else if (str_cr == ENC_CODERANGE_VALID) {
02054         res_encindex = str_encindex;
02055         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
02056             res_cr = str_cr;
02057         else
02058             res_cr = ptr_cr;
02059     }
02060     else { /* str_cr == ENC_CODERANGE_BROKEN */
02061         res_encindex = str_encindex;
02062         res_cr = str_cr;
02063         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
02064     }
02065 
02066     if (len < 0) {
02067         rb_raise(rb_eArgError, "negative string size (or size too big)");
02068     }
02069     str_buf_cat(str, ptr, len);
02070     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
02071     return str;
02072 }
02073 
02074 VALUE
02075 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02076 {
02077     return rb_enc_cr_str_buf_cat(str, ptr, len,
02078         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02079 }
02080 
02081 VALUE
02082 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02083 {
02084     /* ptr must reference NUL terminated ASCII string. */
02085     int encindex = ENCODING_GET(str);
02086     rb_encoding *enc = rb_enc_from_index(encindex);
02087     if (rb_enc_asciicompat(enc)) {
02088         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02089             encindex, ENC_CODERANGE_7BIT, 0);
02090     }
02091     else {
02092         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02093         while (*ptr) {
02094             unsigned int c = (unsigned char)*ptr;
02095             int len = rb_enc_codelen(c, enc);
02096             rb_enc_mbcput(c, buf, enc);
02097             rb_enc_cr_str_buf_cat(str, buf, len,
02098                 encindex, ENC_CODERANGE_VALID, 0);
02099             ptr++;
02100         }
02101         return str;
02102     }
02103 }
02104 
02105 VALUE
02106 rb_str_buf_append(VALUE str, VALUE str2)
02107 {
02108     int str2_cr;
02109 
02110     str2_cr = ENC_CODERANGE(str2);
02111 
02112     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02113         ENCODING_GET(str2), str2_cr, &str2_cr);
02114 
02115     OBJ_INFECT(str, str2);
02116     ENC_CODERANGE_SET(str2, str2_cr);
02117 
02118     return str;
02119 }
02120 
02121 VALUE
02122 rb_str_append(VALUE str, VALUE str2)
02123 {
02124     rb_encoding *enc;
02125     int cr, cr2;
02126     long len2;
02127 
02128     StringValue(str2);
02129     if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02130         long len = RSTRING_LEN(str) + len2;
02131         enc = rb_enc_check(str, str2);
02132         cr = ENC_CODERANGE(str);
02133         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02134         rb_str_modify_expand(str, len2);
02135         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02136                RSTRING_PTR(str2), len2+1);
02137         RSTRING(str)->as.heap.len = len;
02138         rb_enc_associate(str, enc);
02139         ENC_CODERANGE_SET(str, cr);
02140         OBJ_INFECT(str, str2);
02141         return str;
02142     }
02143     return rb_str_buf_append(str, str2);
02144 }
02145 
02146 /*
02147  *  call-seq:
02148  *     str << integer       -> str
02149  *     str.concat(integer)  -> str
02150  *     str << obj           -> str
02151  *     str.concat(obj)      -> str
02152  *
02153  *  Append---Concatenates the given object to <i>str</i>. If the object is a
02154  *  <code>Integer</code>, it is considered as a codepoint, and is converted
02155  *  to a character before concatenation.
02156  *
02157  *     a = "hello "
02158  *     a << "world"   #=> "hello world"
02159  *     a.concat(33)   #=> "hello world!"
02160  */
02161 
02162 VALUE
02163 rb_str_concat(VALUE str1, VALUE str2)
02164 {
02165     unsigned int code;
02166     rb_encoding *enc = STR_ENC_GET(str1);
02167 
02168     if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) {
02169         if (rb_num_to_uint(str2, &code) == 0) {
02170         }
02171         else if (FIXNUM_P(str2)) {
02172             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02173         }
02174         else {
02175             rb_raise(rb_eRangeError, "bignum out of char range");
02176         }
02177     }
02178     else {
02179         return rb_str_append(str1, str2);
02180     }
02181 
02182     if (enc == rb_usascii_encoding()) {
02183         /* US-ASCII automatically extended to ASCII-8BIT */
02184         char buf[1];
02185         buf[0] = (char)code;
02186         if (code > 0xFF) {
02187             rb_raise(rb_eRangeError, "%u out of char range", code);
02188         }
02189         rb_str_cat(str1, buf, 1);
02190         if (code > 127) {
02191             rb_enc_associate(str1, rb_ascii8bit_encoding());
02192             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02193         }
02194     }
02195     else {
02196         long pos = RSTRING_LEN(str1);
02197         int cr = ENC_CODERANGE(str1);
02198         int len;
02199         char *buf;
02200 
02201         switch (len = rb_enc_codelen(code, enc)) {
02202           case ONIGERR_INVALID_CODE_POINT_VALUE:
02203             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02204             break;
02205           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02206           case 0:
02207             rb_raise(rb_eRangeError, "%u out of char range", code);
02208             break;
02209         }
02210         buf = ALLOCA_N(char, len + 1);
02211         rb_enc_mbcput(code, buf, enc);
02212         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02213             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02214         }
02215         rb_str_resize(str1, pos+len);
02216         memcpy(RSTRING_PTR(str1) + pos, buf, len);
02217         if (cr == ENC_CODERANGE_7BIT && code > 127)
02218             cr = ENC_CODERANGE_VALID;
02219         ENC_CODERANGE_SET(str1, cr);
02220     }
02221     return str1;
02222 }
02223 
02224 /*
02225  *  call-seq:
02226  *     str.prepend(other_str)  -> str
02227  *
02228  *  Prepend---Prepend the given string to <i>str</i>.
02229  *
02230  *     a = "world"
02231  *     a.prepend("hello ") #=> "hello world"
02232  *     a                   #=> "hello world"
02233  */
02234 
02235 static VALUE
02236 rb_str_prepend(VALUE str, VALUE str2)
02237 {
02238     StringValue(str2);
02239     StringValue(str);
02240     rb_str_update(str, 0L, 0L, str2);
02241     return str;
02242 }
02243 
02244 st_index_t
02245 rb_str_hash(VALUE str)
02246 {
02247     int e = ENCODING_GET(str);
02248     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02249         e = 0;
02250     }
02251     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02252 }
02253 
02254 int
02255 rb_str_hash_cmp(VALUE str1, VALUE str2)
02256 {
02257     long len;
02258 
02259     if (!rb_str_comparable(str1, str2)) return 1;
02260     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02261         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02262         return 0;
02263     }
02264     return 1;
02265 }
02266 
02267 /*
02268  * call-seq:
02269  *    str.hash   -> fixnum
02270  *
02271  * Return a hash based on the string's length and content.
02272  */
02273 
02274 static VALUE
02275 rb_str_hash_m(VALUE str)
02276 {
02277     st_index_t hval = rb_str_hash(str);
02278     return INT2FIX(hval);
02279 }
02280 
/* Yields the smaller of a and b; an argument may be evaluated more than once. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
02282 
02283 int
02284 rb_str_comparable(VALUE str1, VALUE str2)
02285 {
02286     int idx1, idx2;
02287     int rc1, rc2;
02288 
02289     if (RSTRING_LEN(str1) == 0) return TRUE;
02290     if (RSTRING_LEN(str2) == 0) return TRUE;
02291     idx1 = ENCODING_GET(str1);
02292     idx2 = ENCODING_GET(str2);
02293     if (idx1 == idx2) return TRUE;
02294     rc1 = rb_enc_str_coderange(str1);
02295     rc2 = rb_enc_str_coderange(str2);
02296     if (rc1 == ENC_CODERANGE_7BIT) {
02297         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02298         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02299             return TRUE;
02300     }
02301     if (rc2 == ENC_CODERANGE_7BIT) {
02302         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02303             return TRUE;
02304     }
02305     return FALSE;
02306 }
02307 
02308 int
02309 rb_str_cmp(VALUE str1, VALUE str2)
02310 {
02311     long len1, len2;
02312     const char *ptr1, *ptr2;
02313     int retval;
02314 
02315     if (str1 == str2) return 0;
02316     RSTRING_GETMEM(str1, ptr1, len1);
02317     RSTRING_GETMEM(str2, ptr2, len2);
02318     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02319         if (len1 == len2) {
02320             if (!rb_str_comparable(str1, str2)) {
02321                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02322                     return 1;
02323                 return -1;
02324             }
02325             return 0;
02326         }
02327         if (len1 > len2) return 1;
02328         return -1;
02329     }
02330     if (retval > 0) return 1;
02331     return -1;
02332 }
02333 
02334 /* expect tail call optimization */
02335 static VALUE
02336 str_eql(const VALUE str1, const VALUE str2)
02337 {
02338     const long len = RSTRING_LEN(str1);
02339     const char *ptr1, *ptr2;
02340 
02341     if (len != RSTRING_LEN(str2)) return Qfalse;
02342     if (!rb_str_comparable(str1, str2)) return Qfalse;
02343     if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02344         return Qtrue;
02345     if (memcmp(ptr1, ptr2, len) == 0)
02346         return Qtrue;
02347     return Qfalse;
02348 }
02349 
02350 /*
02351  *  call-seq:
02352  *     str == obj   -> true or false
02353  *
02354  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02355  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02356  *  <code><=></code> <i>obj</i> returns zero.
02357  */
02358 
02359 VALUE
02360 rb_str_equal(VALUE str1, VALUE str2)
02361 {
02362     if (str1 == str2) return Qtrue;
02363     if (!RB_TYPE_P(str2, T_STRING)) {
02364         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02365             return Qfalse;
02366         }
02367         return rb_equal(str2, str1);
02368     }
02369     return str_eql(str1, str2);
02370 }
02371 
02372 /*
02373  * call-seq:
02374  *   str.eql?(other)   -> true or false
02375  *
02376  * Two strings are equal if they have the same length and content.
02377  */
02378 
02379 static VALUE
02380 rb_str_eql(VALUE str1, VALUE str2)
02381 {
02382     if (str1 == str2) return Qtrue;
02383     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
02384     return str_eql(str1, str2);
02385 }
02386 
02387 /*
02388  *  call-seq:
02389  *     string <=> other_string   -> -1, 0, +1 or nil
02390  *
02391  *
02392  *  Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
02393  *  than, equal to, or greater than +other_string+.
02394  *
02395  *  +nil+ is returned if the two values are incomparable.
02396  *
02397  *  If the strings are of different lengths, and the strings are equal when
02398  *  compared up to the shortest length, then the longer string is considered
02399  *  greater than the shorter one.
02400  *
02401  *  <code><=></code> is the basis for the methods <code><</code>,
02402  *  <code><=</code>, <code>></code>, <code>>=</code>, and
02403  *  <code>between?</code>, included from module Comparable. The method
02404  *  String#== does not use Comparable#==.
02405  *
02406  *     "abcdef" <=> "abcde"     #=> 1
02407  *     "abcdef" <=> "abcdef"    #=> 0
02408  *     "abcdef" <=> "abcdefg"   #=> -1
02409  *     "abcdef" <=> "ABCDEF"    #=> 1
02410  */
02411 
02412 static VALUE
02413 rb_str_cmp_m(VALUE str1, VALUE str2)
02414 {
02415     int result;
02416 
02417     if (!RB_TYPE_P(str2, T_STRING)) {
02418         VALUE tmp = rb_check_funcall(str2, rb_intern("to_str"), 0, 0);
02419         if (RB_TYPE_P(tmp, T_STRING)) {
02420             result = rb_str_cmp(str1, tmp);
02421         }
02422         else {
02423             return rb_invcmp(str1, str2);
02424         }
02425     }
02426     else {
02427         result = rb_str_cmp(str1, str2);
02428     }
02429     return INT2FIX(result);
02430 }
02431 
02432 /*
02433  *  call-seq:
02434  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02435  *
02436  *  Case-insensitive version of <code>String#<=></code>.
02437  *
02438  *     "abcdef".casecmp("abcde")     #=> 1
02439  *     "aBcDeF".casecmp("abcdef")    #=> 0
02440  *     "abcdef".casecmp("abcdefg")   #=> -1
02441  *     "abcdef".casecmp("ABCDEF")    #=> 0
02442  */
02443 
02444 static VALUE
02445 rb_str_casecmp(VALUE str1, VALUE str2)
02446 {
02447     long len;
02448     rb_encoding *enc;
02449     char *p1, *p1end, *p2, *p2end;
02450 
02451     StringValue(str2);
02452     enc = rb_enc_compatible(str1, str2);
02453     if (!enc) {
02454         return Qnil;
02455     }
02456 
02457     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02458     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02459     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02460         while (p1 < p1end && p2 < p2end) {
02461             if (*p1 != *p2) {
02462                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02463                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02464                 if (c1 != c2)
02465                     return INT2FIX(c1 < c2 ? -1 : 1);
02466             }
02467             p1++;
02468             p2++;
02469         }
02470     }
02471     else {
02472         while (p1 < p1end && p2 < p2end) {
02473             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02474             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02475 
02476             if (0 <= c1 && 0 <= c2) {
02477                 c1 = TOUPPER(c1);
02478                 c2 = TOUPPER(c2);
02479                 if (c1 != c2)
02480                     return INT2FIX(c1 < c2 ? -1 : 1);
02481             }
02482             else {
02483                 int r;
02484                 l1 = rb_enc_mbclen(p1, p1end, enc);
02485                 l2 = rb_enc_mbclen(p2, p2end, enc);
02486                 len = l1 < l2 ? l1 : l2;
02487                 r = memcmp(p1, p2, len);
02488                 if (r != 0)
02489                     return INT2FIX(r < 0 ? -1 : 1);
02490                 if (l1 != l2)
02491                     return INT2FIX(l1 < l2 ? -1 : 1);
02492             }
02493             p1 += l1;
02494             p2 += l2;
02495         }
02496     }
02497     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02498     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02499     return INT2FIX(-1);
02500 }
02501 
02502 static long
02503 rb_str_index(VALUE str, VALUE sub, long offset)
02504 {
02505     long pos;
02506     char *s, *sptr, *e;
02507     long len, slen;
02508     rb_encoding *enc;
02509 
02510     enc = rb_enc_check(str, sub);
02511     if (is_broken_string(sub)) {
02512         return -1;
02513     }
02514     len = str_strlen(str, enc);
02515     slen = str_strlen(sub, enc);
02516     if (offset < 0) {
02517         offset += len;
02518         if (offset < 0) return -1;
02519     }
02520     if (len - offset < slen) return -1;
02521     s = RSTRING_PTR(str);
02522     e = s + RSTRING_LEN(str);
02523     if (offset) {
02524         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02525         s += offset;
02526     }
02527     if (slen == 0) return offset;
02528     /* need proceed one character at a time */
02529     sptr = RSTRING_PTR(sub);
02530     slen = RSTRING_LEN(sub);
02531     len = RSTRING_LEN(str) - offset;
02532     for (;;) {
02533         char *t;
02534         pos = rb_memsearch(sptr, slen, s, len, enc);
02535         if (pos < 0) return pos;
02536         t = rb_enc_right_char_head(s, s+pos, e, enc);
02537         if (t == s + pos) break;
02538         if ((len -= t - s) <= 0) return -1;
02539         offset += t - s;
02540         s = t;
02541     }
02542     return pos + offset;
02543 }
02544 
02545 
02546 /*
02547  *  call-seq:
02548  *     str.index(substring [, offset])   -> fixnum or nil
02549  *     str.index(regexp [, offset])      -> fixnum or nil
02550  *
02551  *  Returns the index of the first occurrence of the given <i>substring</i> or
02552  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02553  *  found. If the second parameter is present, it specifies the position in the
02554  *  string to begin the search.
02555  *
02556  *     "hello".index('e')             #=> 1
02557  *     "hello".index('lo')            #=> 3
02558  *     "hello".index('a')             #=> nil
02559  *     "hello".index(?e)              #=> 1
02560  *     "hello".index(/[aeiou]/, -3)   #=> 4
02561  */
02562 
02563 static VALUE
02564 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02565 {
02566     VALUE sub;
02567     VALUE initpos;
02568     long pos;
02569 
02570     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02571         pos = NUM2LONG(initpos);
02572     }
02573     else {
02574         pos = 0;
02575     }
02576     if (pos < 0) {
02577         pos += str_strlen(str, STR_ENC_GET(str));
02578         if (pos < 0) {
02579             if (RB_TYPE_P(sub, T_REGEXP)) {
02580                 rb_backref_set(Qnil);
02581             }
02582             return Qnil;
02583         }
02584     }
02585 
02586     if (SPECIAL_CONST_P(sub)) goto generic;
02587     switch (BUILTIN_TYPE(sub)) {
02588       case T_REGEXP:
02589         if (pos > str_strlen(str, STR_ENC_GET(str)))
02590             return Qnil;
02591         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02592                          rb_enc_check(str, sub), single_byte_optimizable(str));
02593 
02594         pos = rb_reg_search(sub, str, pos, 0);
02595         pos = rb_str_sublen(str, pos);
02596         break;
02597 
02598       generic:
02599       default: {
02600         VALUE tmp;
02601 
02602         tmp = rb_check_string_type(sub);
02603         if (NIL_P(tmp)) {
02604             rb_raise(rb_eTypeError, "type mismatch: %s given",
02605                      rb_obj_classname(sub));
02606         }
02607         sub = tmp;
02608       }
02609         /* fall through */
02610       case T_STRING:
02611         pos = rb_str_index(str, sub, pos);
02612         pos = rb_str_sublen(str, pos);
02613         break;
02614     }
02615 
02616     if (pos == -1) return Qnil;
02617     return LONG2NUM(pos);
02618 }
02619 
02620 static long
02621 rb_str_rindex(VALUE str, VALUE sub, long pos)
02622 {
02623     long len, slen;
02624     char *s, *sbeg, *e, *t;
02625     rb_encoding *enc;
02626     int singlebyte = single_byte_optimizable(str);
02627 
02628     enc = rb_enc_check(str, sub);
02629     if (is_broken_string(sub)) {
02630         return -1;
02631     }
02632     len = str_strlen(str, enc);
02633     slen = str_strlen(sub, enc);
02634     /* substring longer than string */
02635     if (len < slen) return -1;
02636     if (len - pos < slen) {
02637         pos = len - slen;
02638     }
02639     if (len == 0) {
02640         return pos;
02641     }
02642     sbeg = RSTRING_PTR(str);
02643     e = RSTRING_END(str);
02644     t = RSTRING_PTR(sub);
02645     slen = RSTRING_LEN(sub);
02646     s = str_nth(sbeg, e, pos, enc, singlebyte);
02647     while (s) {
02648         if (memcmp(s, t, slen) == 0) {
02649             return pos;
02650         }
02651         if (pos == 0) break;
02652         pos--;
02653         s = rb_enc_prev_char(sbeg, s, e, enc);
02654     }
02655     return -1;
02656 }
02657 
02658 
02659 /*
02660  *  call-seq:
02661  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02662  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02663  *
02664  *  Returns the index of the last occurrence of the given <i>substring</i> or
02665  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02666  *  found. If the second parameter is present, it specifies the position in the
02667  *  string to end the search---characters beyond this point will not be
02668  *  considered.
02669  *
02670  *     "hello".rindex('e')             #=> 1
02671  *     "hello".rindex('l')             #=> 3
02672  *     "hello".rindex('a')             #=> nil
02673  *     "hello".rindex(?e)              #=> 1
02674  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02675  */
02676 
02677 static VALUE
02678 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02679 {
02680     VALUE sub;
02681     VALUE vpos;
02682     rb_encoding *enc = STR_ENC_GET(str);
02683     long pos, len = str_strlen(str, enc);
02684 
02685     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02686         pos = NUM2LONG(vpos);
02687         if (pos < 0) {
02688             pos += len;
02689             if (pos < 0) {
02690                 if (RB_TYPE_P(sub, T_REGEXP)) {
02691                     rb_backref_set(Qnil);
02692                 }
02693                 return Qnil;
02694             }
02695         }
02696         if (pos > len) pos = len;
02697     }
02698     else {
02699         pos = len;
02700     }
02701 
02702     if (SPECIAL_CONST_P(sub)) goto generic;
02703     switch (BUILTIN_TYPE(sub)) {
02704       case T_REGEXP:
02705         /* enc = rb_get_check(str, sub); */
02706         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02707                          STR_ENC_GET(str), single_byte_optimizable(str));
02708 
02709         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02710             pos = rb_reg_search(sub, str, pos, 1);
02711             pos = rb_str_sublen(str, pos);
02712         }
02713         if (pos >= 0) return LONG2NUM(pos);
02714         break;
02715 
02716       generic:
02717       default: {
02718         VALUE tmp;
02719 
02720         tmp = rb_check_string_type(sub);
02721         if (NIL_P(tmp)) {
02722             rb_raise(rb_eTypeError, "type mismatch: %s given",
02723                      rb_obj_classname(sub));
02724         }
02725         sub = tmp;
02726       }
02727         /* fall through */
02728       case T_STRING:
02729         pos = rb_str_rindex(str, sub, pos);
02730         if (pos >= 0) return LONG2NUM(pos);
02731         break;
02732     }
02733     return Qnil;
02734 }
02735 
02736 /*
02737  *  call-seq:
02738  *     str =~ obj   -> fixnum or nil
02739  *
02740  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02741  *  against <i>str</i>, and returns the position where the match starts, or
02742  *  <code>nil</code> if there is no match. Otherwise, invokes
02743  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02744  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02745  *
02746  *  Note: <code>str =~ regexp</code> is not the same as
02747  *  <code>regexp =~ str</code>. Strings captured from named capture groups
02748  *  are assigned to local variables only in the second case.
02749  *
02750  *     "cat o' 9 tails" =~ /\d/   #=> 7
02751  *     "cat o' 9 tails" =~ 9      #=> nil
02752  */
02753 
02754 static VALUE
02755 rb_str_match(VALUE x, VALUE y)
02756 {
02757     if (SPECIAL_CONST_P(y)) goto generic;
02758     switch (BUILTIN_TYPE(y)) {
02759       case T_STRING:
02760         rb_raise(rb_eTypeError, "type mismatch: String given");
02761 
02762       case T_REGEXP:
02763         return rb_reg_match(y, x);
02764 
02765       generic:
02766       default:
02767         return rb_funcall(y, rb_intern("=~"), 1, x);
02768     }
02769 }
02770 
02771 
02772 static VALUE get_pat(VALUE, int);
02773 
02774 
02775 /*
02776  *  call-seq:
02777  *     str.match(pattern)        -> matchdata or nil
02778  *     str.match(pattern, pos)   -> matchdata or nil
02779  *
02780  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02781  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02782  *  parameter is present, it specifies the position in the string to begin the
02783  *  search.
02784  *
02785  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02786  *     'hello'.match('(.)\1')[0]   #=> "ll"
02787  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02788  *     'hello'.match('xx')         #=> nil
02789  *
02790  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02791  *  that you can write
02792  *
02793  *     str.match(pat) {|m| ...}
02794  *
02795  *  instead of
02796  *
02797  *     if m = str.match(pat)
02798  *       ...
02799  *     end
02800  *
02801  *  The return value is a value from block execution in this case.
02802  */
02803 
02804 static VALUE
02805 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02806 {
02807     VALUE re, result;
02808     if (argc < 1)
02809         rb_check_arity(argc, 1, 2);
02810     re = argv[0];
02811     argv[0] = str;
02812     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02813     if (!NIL_P(result) && rb_block_given_p()) {
02814         return rb_yield(result);
02815     }
02816     return result;
02817 }
02818 
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbouring character */
    NEIGHBOR_FOUND    = 1,  /* the character was replaced by its neighbour */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
02824 
02825 static enum neighbor_char
02826 enc_succ_char(char *p, long len, rb_encoding *enc)
02827 {
02828     long i;
02829     int l;
02830     while (1) {
02831         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02832             p[i] = '\0';
02833         if (i < 0)
02834             return NEIGHBOR_WRAPPED;
02835         ++((unsigned char*)p)[i];
02836         l = rb_enc_precise_mbclen(p, p+len, enc);
02837         if (MBCLEN_CHARFOUND_P(l)) {
02838             l = MBCLEN_CHARFOUND_LEN(l);
02839             if (l == len) {
02840                 return NEIGHBOR_FOUND;
02841             }
02842             else {
02843                 memset(p+l, 0xff, len-l);
02844             }
02845         }
02846         if (MBCLEN_INVALID_P(l) && i < len-1) {
02847             long len2;
02848             int l2;
02849             for (len2 = len-1; 0 < len2; len2--) {
02850                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02851                 if (!MBCLEN_INVALID_P(l2))
02852                     break;
02853             }
02854             memset(p+len2+1, 0xff, len-(len2+1));
02855         }
02856     }
02857 }
02858 
02859 static enum neighbor_char
02860 enc_pred_char(char *p, long len, rb_encoding *enc)
02861 {
02862     long i;
02863     int l;
02864     while (1) {
02865         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02866             p[i] = '\xff';
02867         if (i < 0)
02868             return NEIGHBOR_WRAPPED;
02869         --((unsigned char*)p)[i];
02870         l = rb_enc_precise_mbclen(p, p+len, enc);
02871         if (MBCLEN_CHARFOUND_P(l)) {
02872             l = MBCLEN_CHARFOUND_LEN(l);
02873             if (l == len) {
02874                 return NEIGHBOR_FOUND;
02875             }
02876             else {
02877                 memset(p+l, 0, len-l);
02878             }
02879         }
02880         if (MBCLEN_INVALID_P(l) && i < len-1) {
02881             long len2;
02882             int l2;
02883             for (len2 = len-1; 0 < len2; len2--) {
02884                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02885                 if (!MBCLEN_INVALID_P(l2))
02886                     break;
02887             }
02888             memset(p+len2+1, 0, len-(len2+1));
02889         }
02890     }
02891 }
02892 
02893 /*
02894   overwrite +p+ by succeeding letter in +enc+ and returns
02895   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02896   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
02897   assuming each ranges are successive, and mbclen
02898   never change in each ranges.
02899   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
02900   character.
02901  */
02902 static enum neighbor_char
02903 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02904 {
02905     enum neighbor_char ret;
02906     unsigned int c;
02907     int ctype;
02908     int range;
02909     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02910 
02911     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02912     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02913         ctype = ONIGENC_CTYPE_DIGIT;
02914     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02915         ctype = ONIGENC_CTYPE_ALPHA;
02916     else
02917         return NEIGHBOR_NOT_CHAR;
02918 
02919     MEMCPY(save, p, char, len);
02920     ret = enc_succ_char(p, len, enc);
02921     if (ret == NEIGHBOR_FOUND) {
02922         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02923         if (rb_enc_isctype(c, ctype, enc))
02924             return NEIGHBOR_FOUND;
02925     }
02926     MEMCPY(p, save, char, len);
02927     range = 1;
02928     while (1) {
02929         MEMCPY(save, p, char, len);
02930         ret = enc_pred_char(p, len, enc);
02931         if (ret == NEIGHBOR_FOUND) {
02932             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02933             if (!rb_enc_isctype(c, ctype, enc)) {
02934                 MEMCPY(p, save, char, len);
02935                 break;
02936             }
02937         }
02938         else {
02939             MEMCPY(p, save, char, len);
02940             break;
02941         }
02942         range++;
02943     }
02944     if (range == 1) {
02945         return NEIGHBOR_NOT_CHAR;
02946     }
02947 
02948     if (ctype != ONIGENC_CTYPE_DIGIT) {
02949         MEMCPY(carry, p, char, len);
02950         return NEIGHBOR_WRAPPED;
02951     }
02952 
02953     MEMCPY(carry, p, char, len);
02954     enc_succ_char(carry, len, enc);
02955     return NEIGHBOR_WRAPPED;
02956 }
02957 
02958 
02959 /*
02960  *  call-seq:
02961  *     str.succ   -> new_str
02962  *     str.next   -> new_str
02963  *
02964  *  Returns the successor to <i>str</i>. The successor is calculated by
02965  *  incrementing characters starting from the rightmost alphanumeric (or
02966  *  the rightmost character if there are no alphanumerics) in the
02967  *  string. Incrementing a digit always results in another digit, and
02968  *  incrementing a letter results in another letter of the same case.
02969  *  Incrementing nonalphanumerics uses the underlying character set's
02970  *  collating sequence.
02971  *
02972  *  If the increment generates a ``carry,'' the character to the left of
02973  *  it is incremented. This process repeats until there is no carry,
02974  *  adding an additional character if necessary.
02975  *
02976  *     "abcd".succ        #=> "abce"
02977  *     "THX1138".succ     #=> "THX1139"
02978  *     "<<koala>>".succ   #=> "<<koalb>>"
02979  *     "1999zzz".succ     #=> "2000aaa"
02980  *     "ZZZ9999".succ     #=> "AAAA0000"
02981  *     "***".succ         #=> "**+"
02982  */
02983 
02984 VALUE
02985 rb_str_succ(VALUE orig)
02986 {
02987     rb_encoding *enc;
02988     VALUE str;
02989     char *sbeg, *s, *e, *last_alnum = 0;
02990     int c = -1;                 /* becomes 1 once a character has wrapped in the first loop */
02991     long l;
          /* Bytes inserted at carry_pos when neither loop below returns early;
           * the default is the single byte "\1".  The buffer is also handed to
           * enc_succ_alnum_char(), which presumably fills in the carry
           * character -- TODO confirm against that helper. */
02992     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02993     long carry_pos = 0, carry_len = 1;
02994     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02995 
          /* All edits below go through pointers into str, a new string built
           * from orig's bytes; orig's buffer is only read. */
02996     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02997     rb_enc_cr_str_copy_for_substr(str, orig);
02998     OBJ_INFECT(str, orig);
          /* Nothing to increment in an empty string: return the empty copy. */
02999     if (RSTRING_LEN(str) == 0) return str;
03000 
03001     enc = STR_ENC_GET(orig);
03002     sbeg = RSTRING_PTR(str);
03003     s = e = sbeg + RSTRING_LEN(str);
03004 
          /* First pass: step backwards from the end of the string, one
           * character per iteration, until rb_enc_prev_char() returns 0. */
03005     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
              /* The previous iteration skipped a NEIGHBOR_NOT_CHAR character and
               * an earlier character has already wrapped: stop scanning if this
               * byte is a digit while last_alnum points at a letter, or a letter
               * while last_alnum points at a digit. */
03006         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
03007             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
03008                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
03009                 s = last_alnum;
03010                 break;
03011             }
03012         }
              /* Skip positions where no character length could be determined. */
03013         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03014         neighbor = enc_succ_alnum_char(s, l, enc, carry);
03015         switch (neighbor) {
03016           case NEIGHBOR_NOT_CHAR:
                  /* leave this character alone and keep scanning to the left */
03017             continue;
03018           case NEIGHBOR_FOUND:
                  /* no carry needed: the result is complete */
03019             return str;
03020           case NEIGHBOR_WRAPPED:
                  /* remember the wrapped character and carry on to the left */
03021             last_alnum = s;
03022             break;
03023         }
              /* Only reached after NEIGHBOR_WRAPPED: record where, and how wide,
               * the carry would be inserted if the scan runs out of characters. */
03024         c = 1;
03025         carry_pos = s - sbeg;
03026         carry_len = l;
03027     }
03028     if (c == -1) {              /* str contains no alnum */
              /* Second pass: same backwards walk, but using enc_succ_char() on
               * every character. */
03029         s = e;
03030         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
                  /* NOTE: this declaration shadows the outer `neighbor`. */
03031             enum neighbor_char neighbor;
03032             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
03033             neighbor = enc_succ_char(s, l, enc);
03034             if (neighbor == NEIGHBOR_FOUND)
03035                 return str;
03036             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
03037                 /* wrapped to \0...\0.  search next valid char. */
03038                 enc_succ_char(s, l, enc);
03039             }
                  /* For encodings that are not ASCII-compatible the carry becomes
                   * a copy of the character now at s; otherwise the current
                   * contents of carry (initially "\1") are kept. */
03040             if (!rb_enc_asciicompat(enc)) {
03041                 MEMCPY(carry, s, char, l);
03042                 carry_len = l;
03043             }
03044             carry_pos = s - sbeg;
03045         }
03046     }
          /* Every character wrapped: grow the string by carry_len bytes, shift
           * the bytes from carry_pos onward to the right, and copy the carry
           * into the gap. */
03047     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
03048     s = RSTRING_PTR(str) + carry_pos;
03049     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
03050     memmove(s, carry, carry_len);
03051     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
03052     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
          /* Return value is discarded; presumably called to refresh str's
           * cached code range -- TODO confirm. */
03053     rb_enc_str_coderange(str);
03054     return str;
03055 }
03056 
03057 
03058 /*
03059  *  call-seq:
03060  *     str.succ!   -> str
03061  *     str.next!   -> str
03062  *
03063  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
03064  *  place.
03065  */
03066 
03067 static VALUE
03068 rb_str_succ_bang(VALUE str)
03069 {
03070     rb_str_shared_replace(str, rb_str_succ(str));
03071 
03072     return str;
03073 }
03074 
03075 
03076 /*
03077  *  call-seq:
03078  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
03079  *     str.upto(other_str, exclusive=false)                -> an_enumerator
03080  *
03081  *  Iterates through successive values, starting at <i>str</i> and
03082  *  ending at <i>other_str</i> inclusive, passing each value in turn to
03083  *  the block. The <code>String#succ</code> method is used to generate
03084  *  each value.  If optional second argument exclusive is omitted or is false,
03085  *  the last value will be included; otherwise it will be excluded.
03086  *
03087  *  If no block is given, an enumerator is returned instead.
03088  *
03089  *     "a8".upto("b6") {|s| print s, ' ' }
03090  *     for s in "a8".."b6"
03091  *       print s, ' '
03092  *     end
03093  *
03094  *  <em>produces:</em>
03095  *
03096  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03097  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03098  *
03099  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
03100  *  both are recognized as decimal numbers. In addition, the width of
03101  *  string (e.g. leading zeros) is handled appropriately.
03102  *
03103  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
03104  *     "25".upto("5").to_a   #=> []
03105  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
03106  */
03107 
03108 static VALUE
03109 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03110 {
03111     VALUE end, exclusive;
03112     VALUE current, after_end;
03113     ID succ;
03114     int n, excl, ascii;
03115     rb_encoding *enc;
03116 
03117     rb_scan_args(argc, argv, "11", &end, &exclusive);
03118     RETURN_ENUMERATOR(beg, argc, argv);
03119     excl = RTEST(exclusive);
03120     CONST_ID(succ, "succ");
03121     StringValue(end);
03122     enc = rb_enc_check(beg, end);
03123     ascii = (is_ascii_string(beg) && is_ascii_string(end));
03124     /* single character */
03125     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03126         char c = RSTRING_PTR(beg)[0];
03127         char e = RSTRING_PTR(end)[0];
03128 
03129         if (c > e || (excl && c == e)) return beg;
03130         for (;;) {
03131             rb_yield(rb_enc_str_new(&c, 1, enc));
03132             if (!excl && c == e) break;
03133             c++;
03134             if (excl && c == e) break;
03135         }
03136         return beg;
03137     }
03138     /* both edges are all digits */
03139     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03140         char *s, *send;
03141         VALUE b, e;
03142         int width;
03143 
03144         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03145         width = rb_long2int(send - s);
03146         while (s < send) {
03147             if (!ISDIGIT(*s)) goto no_digits;
03148             s++;
03149         }
03150         s = RSTRING_PTR(end); send = RSTRING_END(end);
03151         while (s < send) {
03152             if (!ISDIGIT(*s)) goto no_digits;
03153             s++;
03154         }
03155         b = rb_str_to_inum(beg, 10, FALSE);
03156         e = rb_str_to_inum(end, 10, FALSE);
03157         if (FIXNUM_P(b) && FIXNUM_P(e)) {
03158             long bi = FIX2LONG(b);
03159             long ei = FIX2LONG(e);
03160             rb_encoding *usascii = rb_usascii_encoding();
03161 
03162             while (bi <= ei) {
03163                 if (excl && bi == ei) break;
03164                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03165                 bi++;
03166             }
03167         }
03168         else {
03169             ID op = excl ? '<' : rb_intern("<=");
03170             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03171 
03172             args[0] = INT2FIX(width);
03173             while (rb_funcall(b, op, 1, e)) {
03174                 args[1] = b;
03175                 rb_yield(rb_str_format(numberof(args), args, fmt));
03176                 b = rb_funcall(b, succ, 0, 0);
03177             }
03178         }
03179         return beg;
03180     }
03181     /* normal case */
03182   no_digits:
03183     n = rb_str_cmp(beg, end);
03184     if (n > 0 || (excl && n == 0)) return beg;
03185 
03186     after_end = rb_funcall(end, succ, 0, 0);
03187     current = rb_str_dup(beg);
03188     while (!rb_str_equal(current, after_end)) {
03189         VALUE next = Qnil;
03190         if (excl || !rb_str_equal(current, end))
03191             next = rb_funcall(current, succ, 0, 0);
03192         rb_yield(current);
03193         if (NIL_P(next)) break;
03194         current = next;
03195         StringValue(current);
03196         if (excl && rb_str_equal(current, end)) break;
03197         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03198             break;
03199     }
03200 
03201     return beg;
03202 }
03203 
03204 static VALUE
03205 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03206 {
03207     if (rb_reg_search(re, str, 0, 0) >= 0) {
03208         VALUE match = rb_backref_get();
03209         int nth = rb_reg_backref_number(match, backref);
03210         return rb_reg_nth_match(nth, match);
03211     }
03212     return Qnil;
03213 }
03214 
03215 static VALUE
03216 rb_str_aref(VALUE str, VALUE indx)
03217 {
03218     long idx;
03219 
03220     if (FIXNUM_P(indx)) {
03221         idx = FIX2LONG(indx);
03222 
03223       num_index:
03224         str = rb_str_substr(str, idx, 1);
03225         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03226         return str;
03227     }
03228 
03229     if (SPECIAL_CONST_P(indx)) goto generic;
03230     switch (BUILTIN_TYPE(indx)) {
03231       case T_REGEXP:
03232         return rb_str_subpat(str, indx, INT2FIX(0));
03233 
03234       case T_STRING:
03235         if (rb_str_index(str, indx, 0) != -1)
03236             return rb_str_dup(indx);
03237         return Qnil;
03238 
03239       generic:
03240       default:
03241         /* check if indx is Range */
03242         {
03243             long beg, len;
03244             VALUE tmp;
03245 
03246             len = str_strlen(str, STR_ENC_GET(str));
03247             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03248               case Qfalse:
03249                 break;
03250               case Qnil:
03251                 return Qnil;
03252               default:
03253                 tmp = rb_str_substr(str, beg, len);
03254                 return tmp;
03255             }
03256         }
03257         idx = NUM2LONG(indx);
03258         goto num_index;
03259     }
03260 
03261     UNREACHABLE;
03262 }
03263 
03264 
03265 /*
03266  *  call-seq:
03267  *     str[index]                 -> new_str or nil
03268  *     str[start, length]         -> new_str or nil
03269  *     str[range]                 -> new_str or nil
03270  *     str[regexp]                -> new_str or nil
03271  *     str[regexp, capture]       -> new_str or nil
03272  *     str[match_str]             -> new_str or nil
03273  *     str.slice(index)           -> new_str or nil
03274  *     str.slice(start, length)   -> new_str or nil
03275  *     str.slice(range)           -> new_str or nil
03276  *     str.slice(regexp)          -> new_str or nil
03277  *     str.slice(regexp, capture) -> new_str or nil
03278  *     str.slice(match_str)       -> new_str or nil
03279  *
03280  *  Element Reference --- If passed a single +index+, returns a substring of
03281  *  one character at that index. If passed a +start+ index and a +length+,
03282  *  returns a substring containing +length+ characters starting at the
03283  *  +index+. If passed a +range+, its beginning and end are interpreted as
03284  *  offsets delimiting the substring to be returned.
03285  *
03286  *  In these three cases, if an index is negative, it is counted from the end
03287  *  of the string.  For the +start+ and +range+ cases the starting index
03288  *  is just before a character, so it may also equal the string's size.
03289  *  Additionally, an empty string is returned when the starting index for a
03290  *  character range is at the end of the string.
03291  *
03292  *  Returns +nil+ if the initial index falls outside the string or the length
03293  *  is negative.
03294  *
03295  *  If a +Regexp+ is supplied, the matching portion of the string is
03296  *  returned.  If a +capture+, which may be a capture group index or name,
03297  *  follows the regular expression, that component of the MatchData is
03298  *  returned instead.
03299  *
03300  *  If a +match_str+ is given, that string is returned if it occurs in
03301  *  the string.
03302  *
03303  *  Returns +nil+ if the regular expression does not match or the match string
03304  *  cannot be found.
03305  *
03306  *     a = "hello there"
03307  *
03308  *     a[1]                   #=> "e"
03309  *     a[2, 3]                #=> "llo"
03310  *     a[2..3]                #=> "ll"
03311  *
03312  *     a[-3, 2]               #=> "er"
03313  *     a[7..-2]               #=> "her"
03314  *     a[-4..-2]              #=> "her"
03315  *     a[-2..-4]              #=> ""
03316  *
03317  *     a[11, 0]               #=> ""
03318  *     a[11]                  #=> nil
03319  *     a[12, 0]               #=> nil
03320  *     a[12..-1]              #=> nil
03321  *
03322  *     a[/[aeiou](.)\1/]      #=> "ell"
03323  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03324  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03325  *     a[/[aeiou](.)\1/, 2]   #=> nil
03326  *
03327  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
03328  *     a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"]     #=> "e"
03329  *
03330  *     a["lo"]                #=> "lo"
03331  *     a["bye"]               #=> nil
03332  */
03333 
03334 static VALUE
03335 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03336 {
03337     if (argc == 2) {
03338         if (RB_TYPE_P(argv[0], T_REGEXP)) {
03339             return rb_str_subpat(str, argv[0], argv[1]);
03340         }
03341         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03342     }
03343     rb_check_arity(argc, 1, 2);
03344     return rb_str_aref(str, argv[0]);
03345 }
03346 
03347 VALUE
03348 rb_str_drop_bytes(VALUE str, long len)
03349 {
03350     char *ptr = RSTRING_PTR(str);
03351     long olen = RSTRING_LEN(str), nlen;
03352 
03353     str_modifiable(str);
03354     if (len > olen) len = olen;
03355     nlen = olen - len;
03356     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03357         char *oldptr = ptr;
03358         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03359         STR_SET_EMBED(str);
03360         STR_SET_EMBED_LEN(str, nlen);
03361         ptr = RSTRING(str)->as.ary;
03362         memmove(ptr, oldptr + len, nlen);
03363         if (fl == STR_NOEMBED) xfree(oldptr);
03364     }
03365     else {
03366         if (!STR_SHARED_P(str)) rb_str_new4(str);
03367         ptr = RSTRING(str)->as.heap.ptr += len;
03368         RSTRING(str)->as.heap.len = nlen;
03369     }
03370     ptr[nlen] = 0;
03371     ENC_CODERANGE_CLEAR(str);
03372     return str;
03373 }
03374 
03375 static void
03376 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03377 {
03378     if (beg == 0 && RSTRING_LEN(val) == 0) {
03379         rb_str_drop_bytes(str, len);
03380         OBJ_INFECT(str, val);
03381         return;
03382     }
03383 
03384     rb_str_modify(str);
03385     if (len < RSTRING_LEN(val)) {
03386         /* expand string */
03387         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03388     }
03389 
03390     if (RSTRING_LEN(val) != len) {
03391         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03392                 RSTRING_PTR(str) + beg + len,
03393                 RSTRING_LEN(str) - (beg + len));
03394     }
03395     if (RSTRING_LEN(val) < beg && len < 0) {
03396         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03397     }
03398     if (RSTRING_LEN(val) > 0) {
03399         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03400     }
03401     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03402     if (RSTRING_PTR(str)) {
03403         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03404     }
03405     OBJ_INFECT(str, val);
03406 }
03407 
03408 static void
03409 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03410 {
03411     long slen;
03412     char *p, *e;
03413     rb_encoding *enc;
03414     int singlebyte = single_byte_optimizable(str);
03415     int cr;
03416 
03417     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03418 
03419     StringValue(val);
03420     enc = rb_enc_check(str, val);
03421     slen = str_strlen(str, enc);
03422 
03423     if (slen < beg) {
03424       out_of_range:
03425         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03426     }
03427     if (beg < 0) {
03428         if (-beg > slen) {
03429             goto out_of_range;
03430         }
03431         beg += slen;
03432     }
03433     if (slen < len || slen < beg + len) {
03434         len = slen - beg;
03435     }
03436     str_modify_keep_cr(str);
03437     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03438     if (!p) p = RSTRING_END(str);
03439     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03440     if (!e) e = RSTRING_END(str);
03441     /* error check */
03442     beg = p - RSTRING_PTR(str); /* physical position */
03443     len = e - p;                /* physical length */
03444     rb_str_splice_0(str, beg, len, val);
03445     rb_enc_associate(str, enc);
03446     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03447     if (cr != ENC_CODERANGE_BROKEN)
03448         ENC_CODERANGE_SET(str, cr);
03449 }
03450 
/*
 * Non-static wrapper around the file-local rb_str_splice(): forwards all
 * four arguments unchanged, replacing len characters of str starting at
 * character index beg with val.
 */
03451 void
03452 rb_str_update(VALUE str, long beg, long len, VALUE val)
03453 {
03454     rb_str_splice(str, beg, len, val);
03455 }
03456 
03457 static void
03458 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03459 {
03460     int nth;
03461     VALUE match;
03462     long start, end, len;
03463     rb_encoding *enc;
03464     struct re_registers *regs;
03465 
03466     if (rb_reg_search(re, str, 0, 0) < 0) {
03467         rb_raise(rb_eIndexError, "regexp not matched");
03468     }
03469     match = rb_backref_get();
03470     nth = rb_reg_backref_number(match, backref);
03471     regs = RMATCH_REGS(match);
03472     if (nth >= regs->num_regs) {
03473       out_of_range:
03474         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03475     }
03476     if (nth < 0) {
03477         if (-nth >= regs->num_regs) {
03478             goto out_of_range;
03479         }
03480         nth += regs->num_regs;
03481     }
03482 
03483     start = BEG(nth);
03484     if (start == -1) {
03485         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03486     }
03487     end = END(nth);
03488     len = end - start;
03489     StringValue(val);
03490     enc = rb_enc_check(str, val);
03491     rb_str_splice_0(str, start, len, val);
03492     rb_enc_associate(str, enc);
03493 }
03494 
03495 static VALUE
03496 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03497 {
03498     long idx, beg;
03499 
03500     if (FIXNUM_P(indx)) {
03501         idx = FIX2LONG(indx);
03502       num_index:
03503         rb_str_splice(str, idx, 1, val);
03504         return val;
03505     }
03506 
03507     if (SPECIAL_CONST_P(indx)) goto generic;
03508     switch (TYPE(indx)) {
03509       case T_REGEXP:
03510         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03511         return val;
03512 
03513       case T_STRING:
03514         beg = rb_str_index(str, indx, 0);
03515         if (beg < 0) {
03516             rb_raise(rb_eIndexError, "string not matched");
03517         }
03518         beg = rb_str_sublen(str, beg);
03519         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03520         return val;
03521 
03522       generic:
03523       default:
03524         /* check if indx is Range */
03525         {
03526             long beg, len;
03527             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03528                 rb_str_splice(str, beg, len, val);
03529                 return val;
03530             }
03531         }
03532         idx = NUM2LONG(indx);
03533         goto num_index;
03534     }
03535 }
03536 
03537 /*
03538  *  call-seq:
03539  *     str[fixnum] = new_str
03540  *     str[fixnum, fixnum] = new_str
03541  *     str[range] = aString
03542  *     str[regexp] = new_str
03543  *     str[regexp, fixnum] = new_str
03544  *     str[regexp, name] = new_str
03545  *     str[other_str] = new_str
03546  *
03547  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03548  *  portion of the string affected is determined using the same criteria as
03549  *  <code>String#[]</code>. If the replacement string is not the same length as
03550  *  the text it is replacing, the string will be adjusted accordingly. If the
03551  *  regular expression or string used as the index doesn't match a position
03552  *  in the string, <code>IndexError</code> is raised. If the regular expression
03553  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03554  *  which portion of the match to replace (effectively using the
03555  *  <code>MatchData</code> indexing rules). The forms that take a
03556  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03557  *  out of range; the <code>Range</code> form will raise a
03558  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03559  *  will raise an <code>IndexError</code> on negative match.
03560  */
03561 
03562 static VALUE
03563 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03564 {
03565     if (argc == 3) {
03566         if (RB_TYPE_P(argv[0], T_REGEXP)) {
03567             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03568         }
03569         else {
03570             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03571         }
03572         return argv[2];
03573     }
03574     rb_check_arity(argc, 2, 3);
03575     return rb_str_aset(str, argv[0], argv[1]);
03576 }
03577 
03578 /*
03579  *  call-seq:
03580  *     str.insert(index, other_str)   -> str
03581  *
03582  *  Inserts <i>other_str</i> before the character at the given
03583  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03584  *  end of the string, and insert <em>after</em> the given character.
03585  *  The intent is to insert <i>other_str</i> so that it starts at the given
03586  *  <i>index</i>.
03587  *
03588  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03589  *     "abcd".insert(3, 'X')    #=> "abcXd"
03590  *     "abcd".insert(4, 'X')    #=> "abcdX"
03591  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03592  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03593  */
03594 
03595 static VALUE
03596 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03597 {
03598     long pos = NUM2LONG(idx);
03599 
03600     if (pos == -1) {
03601         return rb_str_append(str, str2);
03602     }
03603     else if (pos < 0) {
03604         pos++;
03605     }
03606     rb_str_splice(str, pos, 0, str2);
03607     return str;
03608 }
03609 
03610 
03611 /*
03612  *  call-seq:
03613  *     str.slice!(fixnum)           -> new_str or nil
03614  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03615  *     str.slice!(range)            -> new_str or nil
03616  *     str.slice!(regexp)           -> new_str or nil
03617  *     str.slice!(other_str)        -> new_str or nil
03618  *
03619  *  Deletes the specified portion from <i>str</i>, and returns the portion
03620  *  deleted.
03621  *
03622  *     string = "this is a string"
03623  *     string.slice!(2)        #=> "i"
03624  *     string.slice!(3..6)     #=> " is "
03625  *     string.slice!(/s.*t/)   #=> "sa st"
03626  *     string.slice!("r")      #=> "r"
03627  *     string                  #=> "thing"
03628  */
03629 
03630 static VALUE
03631 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03632 {
03633     VALUE result;
03634     VALUE buf[3];
03635     int i;
03636 
03637     rb_check_arity(argc, 1, 2);
03638     for (i=0; i<argc; i++) {
03639         buf[i] = argv[i];
03640     }
03641     str_modify_keep_cr(str);
03642     result = rb_str_aref_m(argc, buf, str);
03643     if (!NIL_P(result)) {
03644         buf[i] = rb_str_new(0,0);
03645         rb_str_aset_m(argc+1, buf, str);
03646     }
03647     return result;
03648 }
03649 
03650 static VALUE
03651 get_pat(VALUE pat, int quote)
03652 {
03653     VALUE val;
03654 
03655     switch (TYPE(pat)) {
03656       case T_REGEXP:
03657         return pat;
03658 
03659       case T_STRING:
03660         break;
03661 
03662       default:
03663         val = rb_check_string_type(pat);
03664         if (NIL_P(val)) {
03665             Check_Type(pat, T_REGEXP);
03666         }
03667         pat = val;
03668     }
03669 
03670     if (quote) {
03671         pat = rb_reg_quote(pat);
03672     }
03673 
03674     return rb_reg_regcomp(pat);
03675 }
03676 
03677 
03678 /*
03679  *  call-seq:
03680  *     str.sub!(pattern, replacement)          -> str or nil
03681  *     str.sub!(pattern) {|match| block }      -> str or nil
03682  *
03683  *  Performs the same substitution as String#sub in-place.
03684  *
03685  *  Returns +str+ if a substitution was performed or +nil+ if no substitution
03686  *  was performed.
03687  */
03688 
03689 static VALUE
03690 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03691 {
03692     VALUE pat, repl, hash = Qnil;
03693     int iter = 0;               /* 1 when the replacement comes from the block */
03694     int tainted = 0;
03695     int untrusted = 0;
03696     long plen;
          /* The replacement argument may be omitted only when a block is given. */
03697     int min_arity = rb_block_given_p() ? 1 : 2;
03698 
03699     rb_check_arity(argc, min_arity, 2);
03700     if (argc == 1) {
03701         iter = 1;
03702     }
03703     else {
              /* The second argument is either a Hash (looked up per match) or
               * something that must convert to a String. */
03704         repl = argv[1];
03705         hash = rb_check_hash_type(argv[1]);
03706         if (NIL_P(hash)) {
03707             StringValue(repl);
03708         }
03709         if (OBJ_TAINTED(repl)) tainted = 1;
03710         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03711     }
03712 
          /* quote == 1 is passed to get_pat() for the pattern argument. */
03713     pat = get_pat(argv[0], 1);
03714     str_modifiable(str);
03715     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03716         rb_encoding *enc;
03717         int cr = ENC_CODERANGE(str);
03718         VALUE match = rb_backref_get();
03719         struct re_registers *regs = RMATCH_REGS(match);
              /* Offsets of the whole match; used below as byte offsets into
               * str's buffer. */
03720         long beg0 = BEG(0);
03721         long end0 = END(0);
03722         char *p, *rp;
03723         long len, rlen;
03724 
03725         if (iter || !NIL_P(hash)) {
                  /* Remember buffer and length so str_mod_check() can compare
                   * them after the block or the hash lookup has run. */
03726             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03727 
03728             if (iter) {
03729                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03730             }
03731             else {
03732                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03733                 repl = rb_obj_as_string(repl);
03734             }
03735             str_mod_check(str, p, len);
03736             rb_check_frozen(str);
03737         }
03738         else {
                  /* String replacement: rb_reg_regsub() builds the text to
                   * insert from repl and the match registers. */
03739             repl = rb_reg_regsub(repl, str, regs, pat);
03740         }
03741         enc = rb_enc_compatible(str, repl);
03742         if (!enc) {
                  /* Encodings are incompatible: this is only accepted when the
                   * parts of str before and after the match both scan as 7-bit;
                   * the result then takes repl's encoding. */
03743             rb_encoding *str_enc = STR_ENC_GET(str);
03744             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03745             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03746                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03747                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03748                          rb_enc_name(str_enc),
03749                          rb_enc_name(STR_ENC_GET(repl)));
03750             }
03751             enc = STR_ENC_GET(repl);
03752         }
03753         rb_str_modify(str);
03754         rb_enc_associate(str, enc);
03755         if (OBJ_TAINTED(repl)) tainted = 1;
03756         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
              /* Only when str's code range was 7BIT or VALID: derive the new
               * code range from repl's.  It becomes UNKNOWN if repl is BROKEN
               * or if str was VALID and repl is 7BIT; otherwise it is repl's. */
03757         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03758             int cr2 = ENC_CODERANGE(repl);
03759             if (cr2 == ENC_CODERANGE_BROKEN ||
03760                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03761                 cr = ENC_CODERANGE_UNKNOWN;
03762             else
03763                 cr = cr2;
03764         }
              /* Replace the plen matched bytes with the rlen bytes of repl:
               * grow if needed, shift the tail, then copy repl in. */
03765         plen = end0 - beg0;
03766         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03767         len = RSTRING_LEN(str);
03768         if (rlen > plen) {
03769             RESIZE_CAPA(str, len + rlen - plen);
03770         }
              /* Re-read the pointer: it is fetched again after the resize. */
03771         p = RSTRING_PTR(str);
03772         if (rlen != plen) {
03773             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03774         }
03775         memcpy(p + beg0, rp, rlen);
03776         len += rlen - plen;
03777         STR_SET_LEN(str, len);
03778         RSTRING_PTR(str)[len] = '\0';
03779         ENC_CODERANGE_SET(str, cr);
03780         if (tainted) OBJ_TAINT(str);
03781         if (untrusted) OBJ_UNTRUST(str);
03782 
03783         return str;
03784     }
          /* No match: nothing was substituted. */
03785     return Qnil;
03786 }
03787 
03788 
03789 /*
03790  *  call-seq:
03791  *     str.sub(pattern, replacement)         -> new_str
03792  *     str.sub(pattern, hash)                -> new_str
03793  *     str.sub(pattern) {|match| block }     -> new_str
03794  *
03795  *  Returns a copy of +str+ with the _first_ occurrence of +pattern+
03796  *  replaced by the second argument. The +pattern+ is typically a Regexp; if
03797  *  given as a String, any regular expression metacharacters it contains will
03798  *  be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
03799  *  followed by 'd', instead of a digit.
03800  *
03801  *  If +replacement+ is a String it will be substituted for the matched text.
03802  *  It may contain back-references to the pattern's capture groups of the form
03803  *  <code>"\\d"</code>, where <i>d</i> is a group number, or
03804  *  <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
03805  *  double-quoted string, both back-references must be preceded by an
03806  *  additional backslash. However, within +replacement+ the special match
03807  *  variables, such as <code>$&</code>, will not refer to the current match.
03808  *
03809  *  If the second argument is a Hash, and the matched text is one of its keys,
03810  *  the corresponding value is the replacement string.
03811  *
03812  *  In the block form, the current match string is passed in as a parameter,
03813  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03814  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03815  *  returned by the block will be substituted for the match on each call.
03816  *
03817  *  The result inherits any tainting in the original string or any supplied
03818  *  replacement string.
03819  *
03820  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03821  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03822  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03823  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03824  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03825  *      #=> "Is /bin/bash your preferred shell?"
03826  */
03827 
03828 static VALUE
03829 rb_str_sub(int argc, VALUE *argv, VALUE str)
03830 {
03831     str = rb_str_dup(str);
03832     rb_str_sub_bang(argc, argv, str);
03833     return str;
03834 }
03835 
/*
 * Global substitution helper: replaces every match of argv[0] in str.
 * The replacement comes from the block (argc == 1), or from argv[1], which
 * is either a Hash looked up per match or a String.
 *
 * With bang != 0 the result replaces str's contents and str is returned,
 * or nil when there was no match.  With bang == 0 a new string is
 * returned (a dup of str when there was no match).
 * NOTE(review): the callers are outside this view; presumably these are
 * String#gsub! and String#gsub respectively -- confirm.
 */
03836 static VALUE
03837 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03838 {
03839     VALUE pat, val, repl, match, dest, hash = Qnil;
03840     struct re_registers *regs;
03841     long beg, n;
03842     long beg0, end0;
03843     long offset, blen, slen, len, last;
03844     int iter = 0;
03845     char *sp, *cp;
03846     int tainted = 0;
03847     rb_encoding *str_enc;
03848 
03849     switch (argc) {
03850       case 1:
              /* No replacement argument: the block supplies it. */
03851         RETURN_ENUMERATOR(str, argc, argv);
03852         iter = 1;
03853         break;
03854       case 2:
              /* Replacement is a Hash, or must convert to a String. */
03855         repl = argv[1];
03856         hash = rb_check_hash_type(argv[1]);
03857         if (NIL_P(hash)) {
03858             StringValue(repl);
03859         }
03860         if (OBJ_TAINTED(repl)) tainted = 1;
03861         break;
03862       default:
              /* argc is outside 1..2 here, so this check is expected to raise. */
03863         rb_check_arity(argc, 1, 2);
03864     }
03865 
03866     pat = get_pat(argv[0], 1);
03867     beg = rb_reg_search(pat, str, 0, 0);
03868     if (beg < 0) {
03869         if (bang) return Qnil;  /* no match, no substitution */
03870         return rb_str_dup(str);
03871     }
03872 
03873     offset = 0;
03874     n = 0;
03875     blen = RSTRING_LEN(str) + 30; /* len + margin */
          /* The result is assembled in dest; str itself is only read in the loop. */
03876     dest = rb_str_buf_new(blen);
          /* sp/slen snapshot the buffer for str_mod_check(); cp tracks the start
           * of the text not yet copied to dest. */
03877     sp = RSTRING_PTR(str);
03878     slen = RSTRING_LEN(str);
03879     cp = sp;
03880     str_enc = STR_ENC_GET(str);
03881     rb_enc_associate(dest, str_enc);
03882     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03883 
          /* One iteration per match; the first match was found above. */
03884     do {
03885         n++;
03886         match = rb_backref_get();
03887         regs = RMATCH_REGS(match);
03888         beg0 = BEG(0);
03889         end0 = END(0);
03890         if (iter || !NIL_P(hash)) {
03891             if (iter) {
03892                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03893             }
03894             else {
03895                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03896                 val = rb_obj_as_string(val);
03897             }
                  /* The block or hash lookup ran arbitrary code: compare str
                   * against the snapshot taken before the loop. */
03898             str_mod_check(str, sp, slen);
03899             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03900                 rb_raise(rb_eRuntimeError, "block should not cheat");
03901             }
03902         }
03903         else {
03904             val = rb_reg_regsub(repl, str, regs, pat);
03905         }
03906 
03907         if (OBJ_TAINTED(val)) tainted = 1;
03908 
03909         len = beg0 - offset;    /* copy pre-match substr */
03910         if (len) {
03911             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03912         }
03913 
03914         rb_str_buf_append(dest, val);
03915 
              /* last remembers where this search started; it is used after
               * the loop to repeat the search. */
03916         last = offset;
03917         offset = end0;
03918         if (beg0 == end0) {
03919             /*
03920              * Always consume at least one character of the input string
03921              * in order to prevent infinite loops.
03922              */
03923             if (RSTRING_LEN(str) <= end0) break;
03924             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03925             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03926             offset = end0 + len;
03927         }
03928         cp = RSTRING_PTR(str) + offset;
03929         if (offset > RSTRING_LEN(str)) break;
03930         beg = rb_reg_search(pat, str, offset, 0);
03931     } while (beg >= 0);
          /* Copy whatever follows the last match. */
03932     if (RSTRING_LEN(str) > offset) {
03933         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03934     }
          /* Result is ignored; presumably re-run so the backref reflects the
           * last successful match rather than the final failed search --
           * TODO confirm. */
03935     rb_reg_search(pat, str, last, 0);
03936     if (bang) {
03937         rb_str_shared_replace(str, dest);
03938     }
03939     else {
              /* Non-bang: hand back dest, given str's class and taint. */
03940         RBASIC(dest)->klass = rb_obj_class(str);
03941         OBJ_INFECT(dest, str);
03942         str = dest;
03943     }
03944 
03945     if (tainted) OBJ_TAINT(str);
03946     return str;
03947 }
03948 
03949 
03950 /*
03951  *  call-seq:
03952  *     str.gsub!(pattern, replacement)        -> str or nil
03953  *     str.gsub!(pattern) {|match| block }    -> str or nil
03954  *     str.gsub!(pattern)                     -> an_enumerator
03955  *
03956  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03957  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03958  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03959  */
03960 
03961 static VALUE
03962 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03963 {
03964     str_modify_keep_cr(str);
03965     return str_gsub(argc, argv, str, 1);
03966 }
03967 
03968 
03969 /*
03970  *  call-seq:
03971  *     str.gsub(pattern, replacement)       -> new_str
03972  *     str.gsub(pattern, hash)              -> new_str
03973  *     str.gsub(pattern) {|match| block }   -> new_str
03974  *     str.gsub(pattern)                    -> enumerator
03975  *
03976  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03977  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03978  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03979  *  regular expression metacharacters it contains will be interpreted
03980  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03981  *  instead of a digit.
03982  *
03983  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03984  *  the matched text. It may contain back-references to the pattern's capture
03985  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03986  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03987  *  double-quoted string, both back-references must be preceded by an
03988  *  additional backslash. However, within <i>replacement</i> the special match
03989  *  variables, such as <code>$&</code>, will not refer to the current match.
03990  *
03991  *  If the second argument is a <code>Hash</code>, and the matched text is one
03992  *  of its keys, the corresponding value is the replacement string.
03993  *
03994  *  In the block form, the current match string is passed in as a parameter,
03995  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03996  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03997  *  returned by the block will be substituted for the match on each call.
03998  *
03999  *  The result inherits any tainting in the original string or any supplied
04000  *  replacement string.
04001  *
04002  *  When neither a block nor a second argument is supplied, an
04003  *  <code>Enumerator</code> is returned.
04004  *
04005  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
04006  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
04007  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
04008  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
04009  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
04010  */
04011 
04012 static VALUE
04013 rb_str_gsub(int argc, VALUE *argv, VALUE str)
04014 {
04015     return str_gsub(argc, argv, str, 0);
04016 }
04017 
04018 
04019 /*
04020  *  call-seq:
04021  *     str.replace(other_str)   -> str
04022  *
04023  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
04024  *  values in <i>other_str</i>.
04025  *
04026  *     s = "hello"         #=> "hello"
04027  *     s.replace "world"   #=> "world"
04028  */
04029 
04030 VALUE
04031 rb_str_replace(VALUE str, VALUE str2)
04032 {
04033     str_modifiable(str);
04034     if (str == str2) return str;
04035 
04036     StringValue(str2);
04037     str_discard(str);
04038     return str_replace(str, str2);
04039 }
04040 
04041 /*
04042  *  call-seq:
04043  *     string.clear    ->  string
04044  *
04045  *  Makes string empty.
04046  *
04047  *     a = "abcde"
04048  *     a.clear    #=> ""
04049  */
04050 
04051 static VALUE
04052 rb_str_clear(VALUE str)
04053 {
04054     str_discard(str);
04055     STR_SET_EMBED(str);
04056     STR_SET_EMBED_LEN(str, 0);
04057     RSTRING_PTR(str)[0] = 0;
04058     if (rb_enc_asciicompat(STR_ENC_GET(str)))
04059         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04060     else
04061         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04062     return str;
04063 }
04064 
04065 /*
04066  *  call-seq:
04067  *     string.chr    ->  string
04068  *
04069  *  Returns a one-character string at the beginning of the string.
04070  *
04071  *     a = "abcde"
04072  *     a.chr    #=> "a"
04073  */
04074 
04075 static VALUE
04076 rb_str_chr(VALUE str)
04077 {
04078     return rb_str_substr(str, 0, 1);
04079 }
04080 
04081 /*
04082  *  call-seq:
04083  *     str.getbyte(index)          -> 0 .. 255
04084  *
04085  *  returns the <i>index</i>th byte as an integer.
04086  */
04087 static VALUE
04088 rb_str_getbyte(VALUE str, VALUE index)
04089 {
04090     long pos = NUM2LONG(index);
04091 
04092     if (pos < 0)
04093         pos += RSTRING_LEN(str);
04094     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
04095         return Qnil;
04096 
04097     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
04098 }
04099 
04100 /*
04101  *  call-seq:
04102  *     str.setbyte(index, integer) -> integer
04103  *
04104  *  modifies the <i>index</i>th byte as <i>integer</i>.
04105  */
04106 static VALUE
04107 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04108 {
04109     long pos = NUM2LONG(index);
04110     int byte = NUM2INT(value);
04111 
04112     rb_str_modify(str);
04113 
04114     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04115         rb_raise(rb_eIndexError, "index %ld out of string", pos);
04116     if (pos < 0)
04117         pos += RSTRING_LEN(str);
04118 
04119     RSTRING_PTR(str)[pos] = byte;
04120 
04121     return value;
04122 }
04123 
04124 static VALUE
04125 str_byte_substr(VALUE str, long beg, long len)
04126 {
04127     char *p, *s = RSTRING_PTR(str);
04128     long n = RSTRING_LEN(str);
04129     VALUE str2;
04130 
04131     if (beg > n || len < 0) return Qnil;
04132     if (beg < 0) {
04133         beg += n;
04134         if (beg < 0) return Qnil;
04135     }
04136     if (beg + len > n)
04137         len = n - beg;
04138     if (len <= 0) {
04139         len = 0;
04140         p = 0;
04141     }
04142     else
04143         p = s + beg;
04144 
04145     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04146         str2 = rb_str_new4(str);
04147         str2 = str_new3(rb_obj_class(str2), str2);
04148         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04149         RSTRING(str2)->as.heap.len = len;
04150     }
04151     else {
04152         str2 = rb_str_new5(str, p, len);
04153     }
04154 
04155     str_enc_copy(str2, str);
04156 
04157     if (RSTRING_LEN(str2) == 0) {
04158         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04159             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04160         else
04161             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04162     }
04163     else {
04164         switch (ENC_CODERANGE(str)) {
04165           case ENC_CODERANGE_7BIT:
04166             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04167             break;
04168           default:
04169             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04170             break;
04171         }
04172     }
04173 
04174     OBJ_INFECT(str2, str);
04175 
04176     return str2;
04177 }
04178 
04179 static VALUE
04180 str_byte_aref(VALUE str, VALUE indx)
04181 {
04182     long idx;
04183     switch (TYPE(indx)) {
04184       case T_FIXNUM:
04185         idx = FIX2LONG(indx);
04186 
04187       num_index:
04188         str = str_byte_substr(str, idx, 1);
04189         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04190         return str;
04191 
04192       default:
04193         /* check if indx is Range */
04194         {
04195             long beg, len = RSTRING_LEN(str);
04196 
04197             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04198               case Qfalse:
04199                 break;
04200               case Qnil:
04201                 return Qnil;
04202               default:
04203                 return str_byte_substr(str, beg, len);
04204             }
04205         }
04206         idx = NUM2LONG(indx);
04207         goto num_index;
04208     }
04209 
04210     UNREACHABLE;
04211 }
04212 
04213 /*
04214  *  call-seq:
04215  *     str.byteslice(fixnum)           -> new_str or nil
04216  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
04217  *     str.byteslice(range)            -> new_str or nil
04218  *
04219  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
04220  *  substring of one byte at that position. If passed two <code>Fixnum</code>
04221  *  objects, returns a substring starting at the offset given by the first, and
04222  *  a length given by the second. If given a <code>Range</code>, a substring containing
04223  *  bytes at offsets given by the range is returned. In all three cases, if
04224  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
04225  *  <code>nil</code> if the initial offset falls outside the string, the length
04226  *  is negative, or the beginning of the range is greater than the end.
04227  *  The resulting string keeps the encoding of the original.
04228  *
04229  *     "hello".byteslice(1)     #=> "e"
04230  *     "hello".byteslice(-1)    #=> "o"
04231  *     "hello".byteslice(1, 2)  #=> "el"
04232  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
04233  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
04234  */
04235 
04236 static VALUE
04237 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04238 {
04239     if (argc == 2) {
04240         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04241     }
04242     rb_check_arity(argc, 1, 2);
04243     return str_byte_aref(str, argv[0]);
04244 }
04245 
04246 /*
04247  *  call-seq:
04248  *     str.reverse   -> new_str
04249  *
04250  *  Returns a new string with the characters from <i>str</i> in reverse order.
04251  *
04252  *     "stressed".reverse   #=> "desserts"
04253  */
04254 
04255 static VALUE
04256 rb_str_reverse(VALUE str)
04257 {
04258     rb_encoding *enc;
04259     VALUE rev;
04260     char *s, *e, *p;
04261     int single = 1;
04262 
04263     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04264     enc = STR_ENC_GET(str);
04265     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04266     s = RSTRING_PTR(str); e = RSTRING_END(str);
04267     p = RSTRING_END(rev);
04268 
04269     if (RSTRING_LEN(str) > 1) {
04270         if (single_byte_optimizable(str)) {
04271             while (s < e) {
04272                 *--p = *s++;
04273             }
04274         }
04275         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04276             while (s < e) {
04277                 int clen = rb_enc_fast_mbclen(s, e, enc);
04278 
04279                 if (clen > 1 || (*s & 0x80)) single = 0;
04280                 p -= clen;
04281                 memcpy(p, s, clen);
04282                 s += clen;
04283             }
04284         }
04285         else {
04286             while (s < e) {
04287                 int clen = rb_enc_mbclen(s, e, enc);
04288 
04289                 if (clen > 1 || (*s & 0x80)) single = 0;
04290                 p -= clen;
04291                 memcpy(p, s, clen);
04292                 s += clen;
04293             }
04294         }
04295     }
04296     STR_SET_LEN(rev, RSTRING_LEN(str));
04297     OBJ_INFECT(rev, str);
04298     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04299         if (single) {
04300             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04301         }
04302         else {
04303             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04304         }
04305     }
04306     rb_enc_cr_str_copy_for_substr(rev, str);
04307 
04308     return rev;
04309 }
04310 
04311 
04312 /*
04313  *  call-seq:
04314  *     str.reverse!   -> str
04315  *
04316  *  Reverses <i>str</i> in place.
04317  */
04318 
04319 static VALUE
04320 rb_str_reverse_bang(VALUE str)
04321 {
04322     if (RSTRING_LEN(str) > 1) {
04323         if (single_byte_optimizable(str)) {
04324             char *s, *e, c;
04325 
04326             str_modify_keep_cr(str);
04327             s = RSTRING_PTR(str);
04328             e = RSTRING_END(str) - 1;
04329             while (s < e) {
04330                 c = *s;
04331                 *s++ = *e;
04332                 *e-- = c;
04333             }
04334         }
04335         else {
04336             rb_str_shared_replace(str, rb_str_reverse(str));
04337         }
04338     }
04339     else {
04340         str_modify_keep_cr(str);
04341     }
04342     return str;
04343 }
04344 
04345 
04346 /*
04347  *  call-seq:
04348  *     str.include? other_str   -> true or false
04349  *
04350  *  Returns <code>true</code> if <i>str</i> contains the given string or
04351  *  character.
04352  *
04353  *     "hello".include? "lo"   #=> true
04354  *     "hello".include? "ol"   #=> false
04355  *     "hello".include? ?h     #=> true
04356  */
04357 
04358 static VALUE
04359 rb_str_include(VALUE str, VALUE arg)
04360 {
04361     long i;
04362 
04363     StringValue(arg);
04364     i = rb_str_index(str, arg, 0);
04365 
04366     if (i == -1) return Qfalse;
04367     return Qtrue;
04368 }
04369 
04370 
04371 /*
04372  *  call-seq:
04373  *     str.to_i(base=10)   -> integer
04374  *
04375  *  Returns the result of interpreting leading characters in <i>str</i> as an
04376  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04377  *  end of a valid number are ignored. If there is not a valid number at the
04378  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04379  *  exception when <i>base</i> is valid.
04380  *
04381  *     "12345".to_i             #=> 12345
04382  *     "99 red balloons".to_i   #=> 99
04383  *     "0a".to_i                #=> 0
04384  *     "0a".to_i(16)            #=> 10
04385  *     "hello".to_i             #=> 0
04386  *     "1100101".to_i(2)        #=> 101
04387  *     "1100101".to_i(8)        #=> 294977
04388  *     "1100101".to_i(10)       #=> 1100101
04389  *     "1100101".to_i(16)       #=> 17826049
04390  */
04391 
04392 static VALUE
04393 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04394 {
04395     int base;
04396 
04397     if (argc == 0) base = 10;
04398     else {
04399         VALUE b;
04400 
04401         rb_scan_args(argc, argv, "01", &b);
04402         base = NUM2INT(b);
04403     }
04404     if (base < 0) {
04405         rb_raise(rb_eArgError, "invalid radix %d", base);
04406     }
04407     return rb_str_to_inum(str, base, FALSE);
04408 }
04409 
04410 
04411 /*
04412  *  call-seq:
04413  *     str.to_f   -> float
04414  *
04415  *  Returns the result of interpreting leading characters in <i>str</i> as a
04416  *  floating point number. Extraneous characters past the end of a valid number
04417  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04418  *  <code>0.0</code> is returned. This method never raises an exception.
04419  *
04420  *     "123.45e1".to_f        #=> 1234.5
04421  *     "45.67 degrees".to_f   #=> 45.67
04422  *     "thx1138".to_f         #=> 0.0
04423  */
04424 
04425 static VALUE
04426 rb_str_to_f(VALUE str)
04427 {
04428     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04429 }
04430 
04431 
04432 /*
04433  *  call-seq:
04434  *     str.to_s     -> str
04435  *     str.to_str   -> str
04436  *
04437  *  Returns the receiver.
04438  */
04439 
04440 static VALUE
04441 rb_str_to_s(VALUE str)
04442 {
04443     if (rb_obj_class(str) != rb_cString) {
04444         return str_duplicate(rb_cString, str);
04445     }
04446     return str;
04447 }
04448 
#if 0
/* Disabled helper: encodes code point c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04460 
04461 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04462 
04463 int
04464 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04465 {
04466     char buf[CHAR_ESC_LEN + 1];
04467     int l;
04468 
04469 #if SIZEOF_INT > 4
04470     c &= 0xffffffff;
04471 #endif
04472     if (unicode_p) {
04473         if (c < 0x7F && ISPRINT(c)) {
04474             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04475         }
04476         else if (c < 0x10000) {
04477             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04478         }
04479         else {
04480             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04481         }
04482     }
04483     else {
04484         if (c < 0x100) {
04485             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04486         }
04487         else {
04488             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04489         }
04490     }
04491     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04492     rb_str_buf_cat(result, buf, l);
04493     return l;
04494 }
04495 
04496 /*
04497  * call-seq:
04498  *   str.inspect   -> string
04499  *
04500  * Returns a printable version of _str_, surrounded by quote marks,
04501  * with special characters escaped.
04502  *
04503  *    str = "hello"
04504  *    str[3] = "\b"
04505  *    str.inspect       #=> "\"hel\\bo\""
04506  */
04507 
04508 VALUE
04509 rb_str_inspect(VALUE str)
04510 {
04511     rb_encoding *enc = STR_ENC_GET(str);
04512     const char *p, *pend, *prev;
04513     char buf[CHAR_ESC_LEN + 1];
04514     VALUE result = rb_str_buf_new(0);
04515     rb_encoding *resenc = rb_default_internal_encoding();
04516     int unicode_p = rb_enc_unicode_p(enc);
04517     int asciicompat = rb_enc_asciicompat(enc);
04518     static rb_encoding *utf16, *utf32;
04519 
04520     if (!utf16) utf16 = rb_enc_find("UTF-16");
04521     if (!utf32) utf32 = rb_enc_find("UTF-32");
04522     if (resenc == NULL) resenc = rb_default_external_encoding();
04523     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04524     rb_enc_associate(result, resenc);
04525     str_buf_cat2(result, "\"");
04526 
04527     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04528     prev = p;
04529     if (enc == utf16) {
04530         const unsigned char *q = (const unsigned char *)p;
04531         if (q[0] == 0xFE && q[1] == 0xFF)
04532             enc = rb_enc_find("UTF-16BE");
04533         else if (q[0] == 0xFF && q[1] == 0xFE)
04534             enc = rb_enc_find("UTF-16LE");
04535         else
04536             unicode_p = 0;
04537     }
04538     else if (enc == utf32) {
04539         const unsigned char *q = (const unsigned char *)p;
04540         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04541             enc = rb_enc_find("UTF-32BE");
04542         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04543             enc = rb_enc_find("UTF-32LE");
04544         else
04545             unicode_p = 0;
04546     }
04547     while (p < pend) {
04548         unsigned int c, cc;
04549         int n;
04550 
04551         n = rb_enc_precise_mbclen(p, pend, enc);
04552         if (!MBCLEN_CHARFOUND_P(n)) {
04553             if (p > prev) str_buf_cat(result, prev, p - prev);
04554             n = rb_enc_mbminlen(enc);
04555             if (pend < p + n)
04556                 n = (int)(pend - p);
04557             while (n--) {
04558                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04559                 str_buf_cat(result, buf, strlen(buf));
04560                 prev = ++p;
04561             }
04562             continue;
04563         }
04564         n = MBCLEN_CHARFOUND_LEN(n);
04565         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04566         p += n;
04567         if ((asciicompat || unicode_p) &&
04568           (c == '"'|| c == '\\' ||
04569             (c == '#' &&
04570              p < pend &&
04571              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04572              (cc = rb_enc_codepoint(p,pend,enc),
04573               (cc == '$' || cc == '@' || cc == '{'))))) {
04574             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04575             str_buf_cat2(result, "\\");
04576             if (asciicompat || enc == resenc) {
04577                 prev = p - n;
04578                 continue;
04579             }
04580         }
04581         switch (c) {
04582           case '\n': cc = 'n'; break;
04583           case '\r': cc = 'r'; break;
04584           case '\t': cc = 't'; break;
04585           case '\f': cc = 'f'; break;
04586           case '\013': cc = 'v'; break;
04587           case '\010': cc = 'b'; break;
04588           case '\007': cc = 'a'; break;
04589           case 033: cc = 'e'; break;
04590           default: cc = 0; break;
04591         }
04592         if (cc) {
04593             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04594             buf[0] = '\\';
04595             buf[1] = (char)cc;
04596             str_buf_cat(result, buf, 2);
04597             prev = p;
04598             continue;
04599         }
04600         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04601             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04602             continue;
04603         }
04604         else {
04605             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04606             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04607             prev = p;
04608             continue;
04609         }
04610     }
04611     if (p > prev) str_buf_cat(result, prev, p - prev);
04612     str_buf_cat2(result, "\"");
04613 
04614     OBJ_INFECT(result, str);
04615     return result;
04616 }
04617 
04618 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04619 
04620 /*
04621  *  call-seq:
04622  *     str.dump   -> new_str
04623  *
04624  *  Produces a version of +str+ with all non-printing characters replaced by
04625  *  <code>\nnn</code> notation and all special characters escaped.
04626  *
04627  *    "hello \n ''".dump  #=> "\"hello \\n ''\""
04628  */
04629 
04630 VALUE
04631 rb_str_dump(VALUE str)
04632 {
04633     rb_encoding *enc = rb_enc_get(str);
04634     long len;
04635     const char *p, *pend;
04636     char *q, *qend;
04637     VALUE result;
04638     int u8 = (enc == rb_utf8_encoding());
04639 
04640     len = 2;                    /* "" */
04641     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04642     while (p < pend) {
04643         unsigned char c = *p++;
04644         switch (c) {
04645           case '"':  case '\\':
04646           case '\n': case '\r':
04647           case '\t': case '\f':
04648           case '\013': case '\010': case '\007': case '\033':
04649             len += 2;
04650             break;
04651 
04652           case '#':
04653             len += IS_EVSTR(p, pend) ? 2 : 1;
04654             break;
04655 
04656           default:
04657             if (ISPRINT(c)) {
04658                 len++;
04659             }
04660             else {
04661                 if (u8) {       /* \u{NN} */
04662                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04663                     if (MBCLEN_CHARFOUND_P(n-1)) {
04664                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04665                         while (cc >>= 4) len++;
04666                         len += 5;
04667                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04668                         break;
04669                     }
04670                 }
04671                 len += 4;       /* \xNN */
04672             }
04673             break;
04674         }
04675     }
04676     if (!rb_enc_asciicompat(enc)) {
04677         len += 19;              /* ".force_encoding('')" */
04678         len += strlen(enc->name);
04679     }
04680 
04681     result = rb_str_new5(str, 0, len);
04682     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04683     q = RSTRING_PTR(result); qend = q + len + 1;
04684 
04685     *q++ = '"';
04686     while (p < pend) {
04687         unsigned char c = *p++;
04688 
04689         if (c == '"' || c == '\\') {
04690             *q++ = '\\';
04691             *q++ = c;
04692         }
04693         else if (c == '#') {
04694             if (IS_EVSTR(p, pend)) *q++ = '\\';
04695             *q++ = '#';
04696         }
04697         else if (c == '\n') {
04698             *q++ = '\\';
04699             *q++ = 'n';
04700         }
04701         else if (c == '\r') {
04702             *q++ = '\\';
04703             *q++ = 'r';
04704         }
04705         else if (c == '\t') {
04706             *q++ = '\\';
04707             *q++ = 't';
04708         }
04709         else if (c == '\f') {
04710             *q++ = '\\';
04711             *q++ = 'f';
04712         }
04713         else if (c == '\013') {
04714             *q++ = '\\';
04715             *q++ = 'v';
04716         }
04717         else if (c == '\010') {
04718             *q++ = '\\';
04719             *q++ = 'b';
04720         }
04721         else if (c == '\007') {
04722             *q++ = '\\';
04723             *q++ = 'a';
04724         }
04725         else if (c == '\033') {
04726             *q++ = '\\';
04727             *q++ = 'e';
04728         }
04729         else if (ISPRINT(c)) {
04730             *q++ = c;
04731         }
04732         else {
04733             *q++ = '\\';
04734             if (u8) {
04735                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04736                 if (MBCLEN_CHARFOUND_P(n)) {
04737                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04738                     p += n;
04739                     snprintf(q, qend-q, "u{%x}", cc);
04740                     q += strlen(q);
04741                     continue;
04742                 }
04743             }
04744             snprintf(q, qend-q, "x%02X", c);
04745             q += 3;
04746         }
04747     }
04748     *q++ = '"';
04749     *q = '\0';
04750     if (!rb_enc_asciicompat(enc)) {
04751         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04752         enc = rb_ascii8bit_encoding();
04753     }
04754     OBJ_INFECT(result, str);
04755     /* result from dump is ASCII */
04756     rb_enc_associate(result, enc);
04757     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04758     return result;
04759 }
04760 
04761 
04762 static void
04763 rb_str_check_dummy_enc(rb_encoding *enc)
04764 {
04765     if (rb_enc_dummy_p(enc)) {
04766         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04767                  rb_enc_name(enc));
04768     }
04769 }
04770 
04771 /*
04772  *  call-seq:
04773  *     str.upcase!   -> str or nil
04774  *
04775  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04776  *  were made.
04777  *  Note: case replacement is effective only in ASCII region.
04778  */
04779 
04780 static VALUE
04781 rb_str_upcase_bang(VALUE str)
04782 {
04783     rb_encoding *enc;
04784     char *s, *send;
04785     int modify = 0;
04786     int n;
04787 
04788     str_modify_keep_cr(str);
04789     enc = STR_ENC_GET(str);
04790     rb_str_check_dummy_enc(enc);
04791     s = RSTRING_PTR(str); send = RSTRING_END(str);
04792     if (single_byte_optimizable(str)) {
04793         while (s < send) {
04794             unsigned int c = *(unsigned char*)s;
04795 
04796             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04797                 *s = 'A' + (c - 'a');
04798                 modify = 1;
04799             }
04800             s++;
04801         }
04802     }
04803     else {
04804         int ascompat = rb_enc_asciicompat(enc);
04805 
04806         while (s < send) {
04807             unsigned int c;
04808 
04809             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04810                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04811                     *s = 'A' + (c - 'a');
04812                     modify = 1;
04813                 }
04814                 s++;
04815             }
04816             else {
04817                 c = rb_enc_codepoint_len(s, send, &n, enc);
04818                 if (rb_enc_islower(c, enc)) {
04819                     /* assuming toupper returns codepoint with same size */
04820                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04821                     modify = 1;
04822                 }
04823                 s += n;
04824             }
04825         }
04826     }
04827 
04828     if (modify) return str;
04829     return Qnil;
04830 }
04831 
04832 
04833 /*
04834  *  call-seq:
04835  *     str.upcase   -> new_str
04836  *
04837  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04838  *  uppercase counterparts. The operation is locale insensitive---only
04839  *  characters ``a'' to ``z'' are affected.
04840  *  Note: case replacement is effective only in ASCII region.
04841  *
04842  *     "hEllO".upcase   #=> "HELLO"
04843  */
04844 
04845 static VALUE
04846 rb_str_upcase(VALUE str)
04847 {
04848     str = rb_str_dup(str);
04849     rb_str_upcase_bang(str);
04850     return str;
04851 }
04852 
04853 
04854 /*
04855  *  call-seq:
04856  *     str.downcase!   -> str or nil
04857  *
04858  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04859  *  changes were made.
04860  *  Note: case replacement is effective only in ASCII region.
04861  */
04862 
04863 static VALUE
04864 rb_str_downcase_bang(VALUE str)
04865 {
04866     rb_encoding *enc;
04867     char *s, *send;
04868     int modify = 0;
04869 
04870     str_modify_keep_cr(str);
04871     enc = STR_ENC_GET(str);
04872     rb_str_check_dummy_enc(enc);
04873     s = RSTRING_PTR(str); send = RSTRING_END(str);
04874     if (single_byte_optimizable(str)) {
04875         while (s < send) {
04876             unsigned int c = *(unsigned char*)s;
04877 
04878             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04879                 *s = 'a' + (c - 'A');
04880                 modify = 1;
04881             }
04882             s++;
04883         }
04884     }
04885     else {
04886         int ascompat = rb_enc_asciicompat(enc);
04887 
04888         while (s < send) {
04889             unsigned int c;
04890             int n;
04891 
04892             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04893                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04894                     *s = 'a' + (c - 'A');
04895                     modify = 1;
04896                 }
04897                 s++;
04898             }
04899             else {
04900                 c = rb_enc_codepoint_len(s, send, &n, enc);
04901                 if (rb_enc_isupper(c, enc)) {
04902                     /* assuming toupper returns codepoint with same size */
04903                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04904                     modify = 1;
04905                 }
04906                 s += n;
04907             }
04908         }
04909     }
04910 
04911     if (modify) return str;
04912     return Qnil;
04913 }
04914 
04915 
04916 /*
04917  *  call-seq:
04918  *     str.downcase   -> new_str
04919  *
04920  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04921  *  lowercase counterparts. The operation is locale insensitive---only
04922  *  characters ``A'' to ``Z'' are affected.
04923  *  Note: case replacement is effective only in ASCII region.
04924  *
04925  *     "hEllO".downcase   #=> "hello"
04926  */
04927 
04928 static VALUE
04929 rb_str_downcase(VALUE str)
04930 {
04931     str = rb_str_dup(str);
04932     rb_str_downcase_bang(str);
04933     return str;
04934 }
04935 
04936 
04937 /*
04938  *  call-seq:
04939  *     str.capitalize!   -> str or nil
04940  *
04941  *  Modifies <i>str</i> by converting the first character to uppercase and the
04942  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04943  *  Note: case conversion is effective only in ASCII region.
04944  *
04945  *     a = "hello"
04946  *     a.capitalize!   #=> "Hello"
04947  *     a               #=> "Hello"
04948  *     a.capitalize!   #=> nil
04949  */
04950 
04951 static VALUE
04952 rb_str_capitalize_bang(VALUE str)
04953 {
04954     rb_encoding *enc;
04955     char *s, *send;
04956     int modify = 0;
04957     unsigned int c;
04958     int n;
04959 
04960     str_modify_keep_cr(str);
04961     enc = STR_ENC_GET(str);
04962     rb_str_check_dummy_enc(enc);
04963     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04964     s = RSTRING_PTR(str); send = RSTRING_END(str);
04965 
04966     c = rb_enc_codepoint_len(s, send, &n, enc);
04967     if (rb_enc_islower(c, enc)) {
04968         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04969         modify = 1;
04970     }
04971     s += n;
04972     while (s < send) {
04973         c = rb_enc_codepoint_len(s, send, &n, enc);
04974         if (rb_enc_isupper(c, enc)) {
04975             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04976             modify = 1;
04977         }
04978         s += n;
04979     }
04980 
04981     if (modify) return str;
04982     return Qnil;
04983 }
04984 
04985 
04986 /*
04987  *  call-seq:
04988  *     str.capitalize   -> new_str
04989  *
04990  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04991  *  and the remainder to lowercase.
04992  *  Note: case conversion is effective only in ASCII region.
04993  *
04994  *     "hello".capitalize    #=> "Hello"
04995  *     "HELLO".capitalize    #=> "Hello"
04996  *     "123ABC".capitalize   #=> "123abc"
04997  */
04998 
04999 static VALUE
05000 rb_str_capitalize(VALUE str)
05001 {
05002     str = rb_str_dup(str);
05003     rb_str_capitalize_bang(str);
05004     return str;
05005 }
05006 
05007 
05008 /*
05009  *  call-seq:
05010  *     str.swapcase!   -> str or nil
05011  *
05012  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
05013  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
05014  *  Note: case conversion is effective only in ASCII region.
05015  */
05016 
05017 static VALUE
05018 rb_str_swapcase_bang(VALUE str)
05019 {
05020     rb_encoding *enc;
05021     char *s, *send;
05022     int modify = 0;
05023     int n;
05024 
05025     str_modify_keep_cr(str);
05026     enc = STR_ENC_GET(str);
05027     rb_str_check_dummy_enc(enc);
05028     s = RSTRING_PTR(str); send = RSTRING_END(str);
05029     while (s < send) {
05030         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
05031 
05032         if (rb_enc_isupper(c, enc)) {
05033             /* assuming toupper returns codepoint with same size */
05034             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
05035             modify = 1;
05036         }
05037         else if (rb_enc_islower(c, enc)) {
05038             /* assuming tolower returns codepoint with same size */
05039             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
05040             modify = 1;
05041         }
05042         s += n;
05043     }
05044 
05045     if (modify) return str;
05046     return Qnil;
05047 }
05048 
05049 
05050 /*
05051  *  call-seq:
05052  *     str.swapcase   -> new_str
05053  *
05054  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
05055  *  to lowercase and lowercase characters converted to uppercase.
05056  *  Note: case conversion is effective only in ASCII region.
05057  *
05058  *     "Hello".swapcase          #=> "hELLO"
05059  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
05060  */
05061 
05062 static VALUE
05063 rb_str_swapcase(VALUE str)
05064 {
05065     str = rb_str_dup(str);
05066     rb_str_swapcase_bang(str);
05067     return str;
05068 }
05069 
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while a c1-c2 range is being expanded */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* final codepoint of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
05077 
05078 static unsigned int
05079 trnext(struct tr *t, rb_encoding *enc)
05080 {
05081     int n;
05082 
05083     for (;;) {
05084         if (!t->gen) {
05085 nextpart:
05086             if (t->p == t->pend) return -1;
05087             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
05088                 t->p += n;
05089             }
05090             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05091             t->p += n;
05092             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
05093                 t->p += n;
05094                 if (t->p < t->pend) {
05095                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
05096                     t->p += n;
05097                     if (t->now > c) {
05098                         if (t->now < 0x80 && c < 0x80) {
05099                             rb_raise(rb_eArgError,
05100                                      "invalid range \"%c-%c\" in string transliteration",
05101                                      t->now, c);
05102                         }
05103                         else {
05104                             rb_raise(rb_eArgError, "invalid range in string transliteration");
05105                         }
05106                         continue; /* not reached */
05107                     }
05108                     t->gen = 1;
05109                     t->max = c;
05110                 }
05111             }
05112             return t->now;
05113         }
05114         else {
05115             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
05116                 if (t->now == t->max) {
05117                     t->gen = 0;
05118                     goto nextpart;
05119                 }
05120             }
05121             if (t->now < t->max) {
05122                 return t->now;
05123             }
05124             else {
05125                 t->gen = 0;
05126                 return t->max;
05127             }
05128         }
05129     }
05130 }
05131 
05132 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05133 
05134 static VALUE
05135 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05136 {
05137     const unsigned int errc = -1;
05138     unsigned int trans[256];
05139     rb_encoding *enc, *e1, *e2;
05140     struct tr trsrc, trrepl;
05141     int cflag = 0;
05142     unsigned int c, c0, last = 0;
05143     int modify = 0, i, l;
05144     char *s, *send;
05145     VALUE hash = 0;
05146     int singlebyte = single_byte_optimizable(str);
05147     int cr;
05148 
05149 #define CHECK_IF_ASCII(c) \
05150     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05151            (cr = ENC_CODERANGE_VALID) : 0)
05152 
05153     StringValue(src);
05154     StringValue(repl);
05155     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05156     if (RSTRING_LEN(repl) == 0) {
05157         return rb_str_delete_bang(1, &src, str);
05158     }
05159 
05160     cr = ENC_CODERANGE(str);
05161     e1 = rb_enc_check(str, src);
05162     e2 = rb_enc_check(str, repl);
05163     if (e1 == e2) {
05164         enc = e1;
05165     }
05166     else {
05167         enc = rb_enc_check(src, repl);
05168     }
05169     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05170     if (RSTRING_LEN(src) > 1 &&
05171         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05172         trsrc.p + l < trsrc.pend) {
05173         cflag = 1;
05174         trsrc.p += l;
05175     }
05176     trrepl.p = RSTRING_PTR(repl);
05177     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05178     trsrc.gen = trrepl.gen = 0;
05179     trsrc.now = trrepl.now = 0;
05180     trsrc.max = trrepl.max = 0;
05181 
05182     if (cflag) {
05183         for (i=0; i<256; i++) {
05184             trans[i] = 1;
05185         }
05186         while ((c = trnext(&trsrc, enc)) != errc) {
05187             if (c < 256) {
05188                 trans[c] = errc;
05189             }
05190             else {
05191                 if (!hash) hash = rb_hash_new();
05192                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05193             }
05194         }
05195         while ((c = trnext(&trrepl, enc)) != errc)
05196             /* retrieve last replacer */;
05197         last = trrepl.now;
05198         for (i=0; i<256; i++) {
05199             if (trans[i] != errc) {
05200                 trans[i] = last;
05201             }
05202         }
05203     }
05204     else {
05205         unsigned int r;
05206 
05207         for (i=0; i<256; i++) {
05208             trans[i] = errc;
05209         }
05210         while ((c = trnext(&trsrc, enc)) != errc) {
05211             r = trnext(&trrepl, enc);
05212             if (r == errc) r = trrepl.now;
05213             if (c < 256) {
05214                 trans[c] = r;
05215                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05216             }
05217             else {
05218                 if (!hash) hash = rb_hash_new();
05219                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05220             }
05221         }
05222     }
05223 
05224     if (cr == ENC_CODERANGE_VALID)
05225         cr = ENC_CODERANGE_7BIT;
05226     str_modify_keep_cr(str);
05227     s = RSTRING_PTR(str); send = RSTRING_END(str);
05228     if (sflag) {
05229         int clen, tlen;
05230         long offset, max = RSTRING_LEN(str);
05231         unsigned int save = -1;
05232         char *buf = ALLOC_N(char, max), *t = buf;
05233 
05234         while (s < send) {
05235             int may_modify = 0;
05236 
05237             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05238             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05239 
05240             s += clen;
05241             if (c < 256) {
05242                 c = trans[c];
05243             }
05244             else if (hash) {
05245                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05246                 if (NIL_P(tmp)) {
05247                     if (cflag) c = last;
05248                     else c = errc;
05249                 }
05250                 else if (cflag) c = errc;
05251                 else c = NUM2INT(tmp);
05252             }
05253             else {
05254                 c = errc;
05255             }
05256             if (c != (unsigned int)-1) {
05257                 if (save == c) {
05258                     CHECK_IF_ASCII(c);
05259                     continue;
05260                 }
05261                 save = c;
05262                 tlen = rb_enc_codelen(c, enc);
05263                 modify = 1;
05264             }
05265             else {
05266                 save = -1;
05267                 c = c0;
05268                 if (enc != e1) may_modify = 1;
05269             }
05270             while (t - buf + tlen >= max) {
05271                 offset = t - buf;
05272                 max *= 2;
05273                 REALLOC_N(buf, char, max);
05274                 t = buf + offset;
05275             }
05276             rb_enc_mbcput(c, t, enc);
05277             if (may_modify && memcmp(s, t, tlen) != 0) {
05278                 modify = 1;
05279             }
05280             CHECK_IF_ASCII(c);
05281             t += tlen;
05282         }
05283         if (!STR_EMBED_P(str)) {
05284             xfree(RSTRING(str)->as.heap.ptr);
05285         }
05286         *t = '\0';
05287         RSTRING(str)->as.heap.ptr = buf;
05288         RSTRING(str)->as.heap.len = t - buf;
05289         STR_SET_NOEMBED(str);
05290         RSTRING(str)->as.heap.aux.capa = max;
05291     }
05292     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05293         while (s < send) {
05294             c = (unsigned char)*s;
05295             if (trans[c] != errc) {
05296                 if (!cflag) {
05297                     c = trans[c];
05298                     *s = c;
05299                     modify = 1;
05300                 }
05301                 else {
05302                     *s = last;
05303                     modify = 1;
05304                 }
05305             }
05306             CHECK_IF_ASCII(c);
05307             s++;
05308         }
05309     }
05310     else {
05311         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05312         long offset;
05313         char *buf = ALLOC_N(char, max), *t = buf;
05314 
05315         while (s < send) {
05316             int may_modify = 0;
05317             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05318             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05319 
05320             if (c < 256) {
05321                 c = trans[c];
05322             }
05323             else if (hash) {
05324                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05325                 if (NIL_P(tmp)) {
05326                     if (cflag) c = last;
05327                     else c = errc;
05328                 }
05329                 else if (cflag) c = errc;
05330                 else c = NUM2INT(tmp);
05331             }
05332             else {
05333                 c = cflag ? last : errc;
05334             }
05335             if (c != errc) {
05336                 tlen = rb_enc_codelen(c, enc);
05337                 modify = 1;
05338             }
05339             else {
05340                 c = c0;
05341                 if (enc != e1) may_modify = 1;
05342             }
05343             while (t - buf + tlen >= max) {
05344                 offset = t - buf;
05345                 max *= 2;
05346                 REALLOC_N(buf, char, max);
05347                 t = buf + offset;
05348             }
05349             if (s != t) {
05350                 rb_enc_mbcput(c, t, enc);
05351                 if (may_modify && memcmp(s, t, tlen) != 0) {
05352                     modify = 1;
05353                 }
05354             }
05355             CHECK_IF_ASCII(c);
05356             s += clen;
05357             t += tlen;
05358         }
05359         if (!STR_EMBED_P(str)) {
05360             xfree(RSTRING(str)->as.heap.ptr);
05361         }
05362         *t = '\0';
05363         RSTRING(str)->as.heap.ptr = buf;
05364         RSTRING(str)->as.heap.len = t - buf;
05365         STR_SET_NOEMBED(str);
05366         RSTRING(str)->as.heap.aux.capa = max;
05367     }
05368 
05369     if (modify) {
05370         if (cr != ENC_CODERANGE_BROKEN)
05371             ENC_CODERANGE_SET(str, cr);
05372         rb_enc_associate(str, enc);
05373         return str;
05374     }
05375     return Qnil;
05376 }
05377 
05378 
05379 /*
05380  *  call-seq:
05381  *     str.tr!(from_str, to_str)   -> str or nil
05382  *
05383  *  Translates <i>str</i> in place, using the same rules as
05384  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05385  *  changes were made.
05386  */
05387 
05388 static VALUE
05389 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05390 {
05391     return tr_trans(str, src, repl, 0);
05392 }
05393 
05394 
05395 /*
05396  *  call-seq:
05397  *     str.tr(from_str, to_str)   -> new_str
05398  *
05399  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
05400  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
05401  *  +from_str+, it is padded with its last character in order to maintain the
05402  *  correspondence.
05403  *
05404  *     "hello".tr('el', 'ip')      #=> "hippo"
05405  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05406  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
05407  *
05408  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
05409  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
05410  *  all characters except those listed.
05411  *
05412  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05413  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05414  *
05415  *  The backslash character <code>\\</code> can be used to escape
05416  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
05417  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
05418  *
05419  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
05420  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
05421  *
05422  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
05423  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
05424  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
05425  *
05426  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
05427  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
05428  */
05429 
05430 static VALUE
05431 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05432 {
05433     str = rb_str_dup(str);
05434     tr_trans(str, src, repl, 0);
05435     return str;
05436 }
05437 
05438 #define TR_TABLE_SIZE 257
05439 static void
05440 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05441                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05442 {
05443     const unsigned int errc = -1;
05444     char buf[256];
05445     struct tr tr;
05446     unsigned int c;
05447     VALUE table = 0, ptable = 0;
05448     int i, l, cflag = 0;
05449 
05450     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05451     tr.gen = tr.now = tr.max = 0;
05452 
05453     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05454         cflag = 1;
05455         tr.p += l;
05456     }
05457     if (first) {
05458         for (i=0; i<256; i++) {
05459             stable[i] = 1;
05460         }
05461         stable[256] = cflag;
05462     }
05463     else if (stable[256] && !cflag) {
05464         stable[256] = 0;
05465     }
05466     for (i=0; i<256; i++) {
05467         buf[i] = cflag;
05468     }
05469 
05470     while ((c = trnext(&tr, enc)) != errc) {
05471         if (c < 256) {
05472             buf[c & 0xff] = !cflag;
05473         }
05474         else {
05475             VALUE key = UINT2NUM(c);
05476 
05477             if (!table && (first || *tablep || stable[256])) {
05478                 if (cflag) {
05479                     ptable = *ctablep;
05480                     table = ptable ? ptable : rb_hash_new();
05481                     *ctablep = table;
05482                 }
05483                 else {
05484                     table = rb_hash_new();
05485                     ptable = *tablep;
05486                     *tablep = table;
05487                 }
05488             }
05489             if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
05490                 rb_hash_aset(table, key, Qtrue);
05491             }
05492         }
05493     }
05494     for (i=0; i<256; i++) {
05495         stable[i] = stable[i] && buf[i];
05496     }
05497     if (!table && !cflag) {
05498         *tablep = 0;
05499     }
05500 }
05501 
05502 
05503 static int
05504 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05505 {
05506     if (c < 256) {
05507         return table[c] != 0;
05508     }
05509     else {
05510         VALUE v = UINT2NUM(c);
05511 
05512         if (del) {
05513             if (!NIL_P(rb_hash_lookup(del, v)) &&
05514                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05515                 return TRUE;
05516             }
05517         }
05518         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05519             return FALSE;
05520         }
05521         return table[256] ? TRUE : FALSE;
05522     }
05523 }
05524 
05525 /*
05526  *  call-seq:
05527  *     str.delete!([other_str]+)   -> str or nil
05528  *
05529  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05530  *  <code>nil</code> if <i>str</i> was not modified.
05531  */
05532 
05533 static VALUE
05534 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05535 {
05536     char squeez[TR_TABLE_SIZE];
05537     rb_encoding *enc = 0;
05538     char *s, *send, *t;
05539     VALUE del = 0, nodel = 0;
05540     int modify = 0;
05541     int i, ascompat, cr;
05542 
05543     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05544     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05545     for (i=0; i<argc; i++) {
05546         VALUE s = argv[i];
05547 
05548         StringValue(s);
05549         enc = rb_enc_check(str, s);
05550         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05551     }
05552 
05553     str_modify_keep_cr(str);
05554     ascompat = rb_enc_asciicompat(enc);
05555     s = t = RSTRING_PTR(str);
05556     send = RSTRING_END(str);
05557     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05558     while (s < send) {
05559         unsigned int c;
05560         int clen;
05561 
05562         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05563             if (squeez[c]) {
05564                 modify = 1;
05565             }
05566             else {
05567                 if (t != s) *t = c;
05568                 t++;
05569             }
05570             s++;
05571         }
05572         else {
05573             c = rb_enc_codepoint_len(s, send, &clen, enc);
05574 
05575             if (tr_find(c, squeez, del, nodel)) {
05576                 modify = 1;
05577             }
05578             else {
05579                 if (t != s) rb_enc_mbcput(c, t, enc);
05580                 t += clen;
05581                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05582             }
05583             s += clen;
05584         }
05585     }
05586     *t = '\0';
05587     STR_SET_LEN(str, t - RSTRING_PTR(str));
05588     ENC_CODERANGE_SET(str, cr);
05589 
05590     if (modify) return str;
05591     return Qnil;
05592 }
05593 
05594 
05595 /*
05596  *  call-seq:
05597  *     str.delete([other_str]+)   -> new_str
05598  *
05599  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05600  *  arguments deleted. Uses the same rules for building the set of characters as
05601  *  <code>String#count</code>.
05602  *
05603  *     "hello".delete "l","lo"        #=> "heo"
05604  *     "hello".delete "lo"            #=> "he"
05605  *     "hello".delete "aeiou", "^e"   #=> "hell"
05606  *     "hello".delete "ej-m"          #=> "ho"
05607  */
05608 
05609 static VALUE
05610 rb_str_delete(int argc, VALUE *argv, VALUE str)
05611 {
05612     str = rb_str_dup(str);
05613     rb_str_delete_bang(argc, argv, str);
05614     return str;
05615 }
05616 
05617 
05618 /*
05619  *  call-seq:
05620  *     str.squeeze!([other_str]*)   -> str or nil
05621  *
05622  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05623  *  <code>nil</code> if no changes were made.
05624  */
05625 
05626 static VALUE
05627 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05628 {
05629     char squeez[TR_TABLE_SIZE];
05630     rb_encoding *enc = 0;
05631     VALUE del = 0, nodel = 0;
05632     char *s, *send, *t;
05633     int i, modify = 0;
05634     int ascompat, singlebyte = single_byte_optimizable(str);
05635     unsigned int save;
05636 
05637     if (argc == 0) {
05638         enc = STR_ENC_GET(str);
05639     }
05640     else {
05641         for (i=0; i<argc; i++) {
05642             VALUE s = argv[i];
05643 
05644             StringValue(s);
05645             enc = rb_enc_check(str, s);
05646             if (singlebyte && !single_byte_optimizable(s))
05647                 singlebyte = 0;
05648             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05649         }
05650     }
05651 
05652     str_modify_keep_cr(str);
05653     s = t = RSTRING_PTR(str);
05654     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05655     send = RSTRING_END(str);
05656     save = -1;
05657     ascompat = rb_enc_asciicompat(enc);
05658 
05659     if (singlebyte) {
05660         while (s < send) {
05661             unsigned int c = *(unsigned char*)s++;
05662             if (c != save || (argc > 0 && !squeez[c])) {
05663                 *t++ = save = c;
05664             }
05665         }
05666     } else {
05667         while (s < send) {
05668             unsigned int c;
05669             int clen;
05670 
05671             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05672                 if (c != save || (argc > 0 && !squeez[c])) {
05673                     *t++ = save = c;
05674                 }
05675                 s++;
05676             }
05677             else {
05678                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05679 
05680                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05681                     if (t != s) rb_enc_mbcput(c, t, enc);
05682                     save = c;
05683                     t += clen;
05684                 }
05685                 s += clen;
05686             }
05687         }
05688     }
05689 
05690     *t = '\0';
05691     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05692         STR_SET_LEN(str, t - RSTRING_PTR(str));
05693         modify = 1;
05694     }
05695 
05696     if (modify) return str;
05697     return Qnil;
05698 }
05699 
05700 
05701 /*
05702  *  call-seq:
05703  *     str.squeeze([other_str]*)    -> new_str
05704  *
05705  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05706  *  procedure described for <code>String#count</code>. Returns a new string
05707  *  where runs of the same character that occur in this set are replaced by a
05708  *  single character. If no arguments are given, all runs of identical
05709  *  characters are replaced by a single character.
05710  *
05711  *     "yellow moon".squeeze                  #=> "yelow mon"
05712  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05713  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05714  */
05715 
05716 static VALUE
05717 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05718 {
05719     str = rb_str_dup(str);
05720     rb_str_squeeze_bang(argc, argv, str);
05721     return str;
05722 }
05723 
05724 
05725 /*
05726  *  call-seq:
05727  *     str.tr_s!(from_str, to_str)   -> str or nil
05728  *
05729  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05730  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05731  */
05732 
05733 static VALUE
05734 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05735 {
05736     return tr_trans(str, src, repl, 1);
05737 }
05738 
05739 
05740 /*
05741  *  call-seq:
05742  *     str.tr_s(from_str, to_str)   -> new_str
05743  *
05744  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05745  *  then removes duplicate characters in regions that were affected by the
05746  *  translation.
05747  *
05748  *     "hello".tr_s('l', 'r')     #=> "hero"
05749  *     "hello".tr_s('el', '*')    #=> "h*o"
05750  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05751  */
05752 
05753 static VALUE
05754 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05755 {
05756     str = rb_str_dup(str);
05757     tr_trans(str, src, repl, 1);
05758     return str;
05759 }
05760 
05761 
05762 /*
05763  *  call-seq:
05764  *     str.count([other_str]+)   -> fixnum
05765  *
05766  *  Each +other_str+ parameter defines a set of characters to count.  The
05767  *  intersection of these sets defines the characters to count in +str+.  Any
05768  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
05769  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
05770  *  backslash character <code>\\</code> can be used to escape <code>^</code> or
05771  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
05772  *  sequence or the end of a +other_str+.
05773  *
05774  *     a = "hello world"
05775  *     a.count "lo"                   #=> 5
05776  *     a.count "lo", "o"              #=> 2
05777  *     a.count "hello", "^l"          #=> 4
05778  *     a.count "ej-m"                 #=> 4
05779  *
05780  *     "hello^world".count "\\^aeiou" #=> 4
05781  *     "hello-world".count "a\\-eo"   #=> 4
05782  *
05783  *     c = "hello world\\r\\n"
05784  *     c.count "\\"                   #=> 2
05785  *     c.count "\\A"                  #=> 0
05786  *     c.count "X-\\w"                #=> 3
05787  */
05788 
05789 static VALUE
05790 rb_str_count(int argc, VALUE *argv, VALUE str)
05791 {
05792     char table[TR_TABLE_SIZE];
05793     rb_encoding *enc = 0;
05794     VALUE del = 0, nodel = 0;
05795     char *s, *send;
05796     int i;
05797     int ascompat;
05798 
05799     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
05800     for (i=0; i<argc; i++) {
05801         VALUE tstr = argv[i];
05802         unsigned char c;
05803 
05804         StringValue(tstr);
05805         enc = rb_enc_check(str, tstr);
05806         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05807             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05808             int n = 0;
05809 
05810             s = RSTRING_PTR(str);
05811             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05812             send = RSTRING_END(str);
05813             while (s < send) {
05814                 if (*(unsigned char*)s++ == c) n++;
05815             }
05816             return INT2NUM(n);
05817         }
05818         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05819     }
05820 
05821     s = RSTRING_PTR(str);
05822     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05823     send = RSTRING_END(str);
05824     ascompat = rb_enc_asciicompat(enc);
05825     i = 0;
05826     while (s < send) {
05827         unsigned int c;
05828 
05829         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05830             if (table[c]) {
05831                 i++;
05832             }
05833             s++;
05834         }
05835         else {
05836             int clen;
05837             c = rb_enc_codepoint_len(s, send, &clen, enc);
05838             if (tr_find(c, table, del, nodel)) {
05839                 i++;
05840             }
05841             s += clen;
05842         }
05843     }
05844 
05845     return INT2NUM(i);
05846 }
05847 
/*
 * Lookup table indexed by byte value: 1 for HT, LF, VT, FF, CR
 * (0x09-0x0d) and SPACE (0x20), 0 for every other byte.  Only the
 * entries up to 0x20 are spelled out; the remaining elements of the
 * 256-entry array are zero-initialized by the language.
 */
static const char isspacetable[256] = {
    /* 0x00-0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08-0x0f */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18-0x1f */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20      */ 1
};

/* Table-driven whitespace test; the argument is reduced to one byte. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05868 
05869 /*
05870  *  call-seq:
05871  *     str.split(pattern=$;, [limit])   -> anArray
05872  *
05873  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05874  *  of these substrings.
05875  *
05876  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05877  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05878  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05879  *  of contiguous whitespace characters ignored.
05880  *
05881  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05882  *  pattern matches. Whenever the pattern matches a zero-length string,
05883  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05884  *  groups, the respective matches will be returned in the array as well.
05885  *
05886  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05887  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05888  *  split on whitespace as if ` ' were specified.
05889  *
05890  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05891  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05892  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05893  *  string is returned as the only entry in an array). If negative, there is no
05894  *  limit to the number of fields returned, and trailing null fields are not
05895  *  suppressed.
05896  *
05897  *  When the input +str+ is empty an empty Array is returned as the string is
05898  *  considered to have no fields to split.
05899  *
05900  *     " now's  the time".split        #=> ["now's", "the", "time"]
05901  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05902  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05903  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05904  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05905  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05906  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05907  *
05908  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05909  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05910  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05911  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05912  *
05913  *     "".split(',', -1)               #=> []
05914  */
05915 
05916 static VALUE
05917 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05918 {
          /*
           * Splits str into an array of substrings.  argv carries up to two
           * optional arguments, the pattern and the limit.  Depending on the
           * pattern one of three strategies is used: awk (split on runs of
           * whitespace), string (literal separator) or regexp.
           */
05919     rb_encoding *enc;
05920     VALUE spat;
05921     VALUE limit;
05922     enum {awk, string, regexp} split_type;
05923     long beg, end, i = 0;
05924     int lim = 0;
05925     VALUE result, tmp;
05926 
          /*
           * Two arguments: a limit was passed.  A non-positive limit means
           * "no cap" (limit is reset to nil while lim keeps its sign); a
           * limit of 1 means no splitting at all.  Otherwise i becomes the
           * running field count that is compared against lim below.
           */
05927     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05928         lim = NUM2INT(limit);
05929         if (lim <= 0) limit = Qnil;
05930         else if (lim == 1) {
05931             if (RSTRING_LEN(str) == 0)
05932                 return rb_ary_new2(0);
05933             return rb_ary_new3(1, str);
05934         }
05935         i = 1;
05936     }
05937 
          /* Choose the strategy.  A nil pattern falls back to rb_fs when
           * that is set and to awk-style splitting otherwise. */
05938     enc = STR_ENC_GET(str);
05939     if (NIL_P(spat)) {
05940         if (!NIL_P(rb_fs)) {
05941             spat = rb_fs;
05942             goto fs_set;
05943         }
05944         split_type = awk;
05945     }
05946     else {
05947       fs_set:
05948         if (RB_TYPE_P(spat, T_STRING)) {
05949             rb_encoding *enc2 = STR_ENC_GET(spat);
05950 
05951             split_type = string;
05952             if (RSTRING_LEN(spat) == 0) {
05953                 /* Special case - split into chars */
05954                 spat = rb_reg_regcomp(spat);
05955                 split_type = regexp;
05956             }
                  /* A pattern that is exactly one space selects awk mode;
                   * the second branch performs the same test for encodings
                   * that are not ASCII-compatible. */
05957             else if (rb_enc_asciicompat(enc2) == 1) {
05958                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05959                     split_type = awk;
05960                 }
05961             }
05962             else {
05963                 int l;
05964                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05965                     RSTRING_LEN(spat) == l) {
05966                     split_type = awk;
05967                 }
05968             }
05969         }
05970         else {
                  /* Anything that is not a String is turned into a pattern
                   * by get_pat() and handled by the regexp strategy. */
05971             spat = get_pat(spat, 1);
05972             split_type = regexp;
05973         }
05974     }
05975 
05976     result = rb_ary_new();
05977     beg = 0;
05978     if (split_type == awk) {
              /*
               * awk mode.  beg/end are byte offsets of the current field.
               * skip is set while whitespace is being consumed (initially,
               * and after each field).  Once the field count reaches lim the
               * scan stops, leaving the rest of str for the final push at
               * the bottom of the function.
               */
05979         char *ptr = RSTRING_PTR(str);
05980         char *eptr = RSTRING_END(str);
05981         char *bptr = ptr;
05982         int skip = 1;
05983         unsigned int c;
05984 
05985         end = beg;
05986         if (is_ascii_string(str)) {
                  /* ASCII-only string: scan byte by byte using the table. */
05987             while (ptr < eptr) {
05988                 c = (unsigned char)*ptr++;
05989                 if (skip) {
05990                     if (ascii_isspace(c)) {
05991                         beg = ptr - bptr;
05992                     }
05993                     else {
05994                         end = ptr - bptr;
05995                         skip = 0;
05996                         if (!NIL_P(limit) && lim <= i) break;
05997                     }
05998                 }
05999                 else if (ascii_isspace(c)) {
06000                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06001                     skip = 1;
06002                     beg = ptr - bptr;
06003                     if (!NIL_P(limit)) ++i;
06004                 }
06005                 else {
06006                     end = ptr - bptr;
06007                 }
06008             }
06009         }
06010         else {
                  /* Same state machine, but decoding one character at a
                   * time and testing it with rb_isspace(). */
06011             while (ptr < eptr) {
06012                 int n;
06013 
06014                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
06015                 ptr += n;
06016                 if (skip) {
06017                     if (rb_isspace(c)) {
06018                         beg = ptr - bptr;
06019                     }
06020                     else {
06021                         end = ptr - bptr;
06022                         skip = 0;
06023                         if (!NIL_P(limit) && lim <= i) break;
06024                     }
06025                 }
06026                 else if (rb_isspace(c)) {
06027                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06028                     skip = 1;
06029                     beg = ptr - bptr;
06030                     if (!NIL_P(limit)) ++i;
06031                 }
06032                 else {
06033                     end = ptr - bptr;
06034                 }
06035             }
06036         }
06037     }
06038     else if (split_type == string) {
              /*
               * Literal separator.  Both strings must be validly encoded
               * (ArgumentError otherwise).  rb_memsearch() yields the offset
               * of the next occurrence relative to ptr, or a negative value
               * when there is none.
               */
06039         char *ptr = RSTRING_PTR(str);
06040         char *temp = ptr;
06041         char *eptr = RSTRING_END(str);
06042         char *sptr = RSTRING_PTR(spat);
06043         long slen = RSTRING_LEN(spat);
06044 
06045         if (is_broken_string(str)) {
06046             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
06047         }
06048         if (is_broken_string(spat)) {
06049             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
06050         }
06051         enc = rb_enc_check(str, spat);
06052         while (ptr < eptr &&
06053                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
06054             /* Check we are at the start of a char */
06055             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
06056             if (t != ptr + end) {
                      /* The hit is inside a character: resume the search
                       * at the next character head. */
06057                 ptr = t;
06058                 continue;
06059             }
06060             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
06061             ptr += end + slen;
06062             if (!NIL_P(limit) && lim <= ++i) break;
06063         }
06064         beg = ptr - temp;
06065     }
06066     else {
              /*
               * Regexp strategy.  start is where the next search begins and
               * beg is where the pending field begins.  last_null records
               * that the previous iteration saw an empty match at start, so
               * that the search was retried one character further on.
               */
06067         char *ptr = RSTRING_PTR(str);
06068         long len = RSTRING_LEN(str);
06069         long start = beg;
06070         long idx;
06071         int last_null = 0;
06072         struct re_registers *regs;
06073 
06074         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
06075             regs = RMATCH_REGS(rb_backref_get());
06076             if (start == end && BEG(0) == END(0)) {
                      /* Empty match exactly at the search position. */
06077                 if (!ptr) {
06078                     rb_ary_push(result, str_new_empty(str));
06079                     break;
06080                 }
06081                 else if (last_null == 1) {
                          /* Second empty match in a row: emit the single
                           * character starting at beg. */
06082                     rb_ary_push(result, rb_str_subseq(str, beg,
06083                                                       rb_enc_fast_mbclen(ptr+beg,
06084                                                                          ptr+len,
06085                                                                          enc)));
06086                     beg = start;
06087                 }
06088                 else {
                          /* First empty match: step over one character (or
                           * one byte at the very end) and search again. */
06089                     if (ptr+start == ptr+len)
06090                         start++;
06091                     else
06092                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
06093                     last_null = 1;
06094                     continue;
06095                 }
06096             }
06097             else {
06098                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
06099                 beg = start = END(0);
06100             }
06101             last_null = 0;
06102 
                  /* Append the text of every capture group that took part
                   * in the match. */
06103             for (idx=1; idx < regs->num_regs; idx++) {
06104                 if (BEG(idx) == -1) continue;
06105                 if (BEG(idx) == END(idx))
06106                     tmp = str_new_empty(str);
06107                 else
06108                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
06109                 rb_ary_push(result, tmp);
06110             }
06111             if (!NIL_P(limit) && lim <= ++i) break;
06112         }
06113     }
          /* Push whatever follows the last separator.  An empty tail is only
           * pushed when a limit argument was given (positive or negative). */
06114     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
06115         if (RSTRING_LEN(str) == beg)
06116             tmp = str_new_empty(str);
06117         else
06118             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
06119         rb_ary_push(result, tmp);
06120     }
          /* No limit argument (or a limit of 0): drop trailing empty
           * strings from the result. */
06121     if (NIL_P(limit) && lim == 0) {
06122         long len;
06123         while ((len = RARRAY_LEN(result)) > 0 &&
06124                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
06125             rb_ary_pop(result);
06126     }
06127 
06128     return result;
06129 }
06130 
06131 VALUE
06132 rb_str_split(VALUE str, const char *sep0)
06133 {
06134     VALUE sep;
06135 
06136     StringValue(str);
06137     sep = rb_str_new2(sep0);
06138     return rb_str_split_m(1, &sep, str);
06139 }
06140 
06141 
06142 static VALUE
06143 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
06144 {
          /*
           * Shared implementation behind rb_str_each_line (wantarray == 0)
           * and rb_str_lines (wantarray == 1).  Cuts str into records that
           * each end with the separator rs and either yields them or
           * collects them in an array.  Returns the array when collecting,
           * otherwise the original receiver.
           */
06145     rb_encoding *enc;
06146     VALUE rs;
06147     unsigned int newline;
06148     const char *p, *pend, *s, *ptr;
06149     long len, rslen;
06150     VALUE line;
06151     int n;
06152     VALUE orig = str;
06153     VALUE UNINITIALIZED_VAR(ary);
06154 
          /* Without an explicit argument the separator is rb_rs. */
06155     if (argc == 0) {
06156         rs = rb_rs;
06157     }
06158     else {
06159         rb_scan_args(argc, argv, "01", &rs);
06160     }
06161 
          /* A block passed to the array form is deprecated: warn and fall
           * back to yielding.  Without a block, either start collecting or
           * hand back an enumerator. */
06162     if (rb_block_given_p()) {
06163         if (wantarray) {
06164 #if 0 /* next major */
06165             rb_warn("given block not used");
06166             ary = rb_ary_new();
06167 #else
06168             rb_warning("passing a block to String#lines is deprecated");
06169             wantarray = 0;
06170 #endif
06171         }
06172     }
06173     else {
06174         if (wantarray)
06175             ary = rb_ary_new();
06176         else
06177             RETURN_ENUMERATOR(str, argc, argv);
06178     }
06179 
          /* A nil separator means the whole string is the only record. */
06180     if (NIL_P(rs)) {
06181         if (wantarray) {
06182             rb_ary_push(ary, str);
06183             return ary;
06184         }
06185         else {
06186             rb_yield(str);
06187             return orig;
06188         }
06189     }
          /*
           * From here on the scan runs over the result of rb_str_new4(str)
           * rather than the receiver itself.  s marks the start of the
           * pending record, p is the scan position and ptr/len remember the
           * buffer for str_mod_check() after each record is handed out.
           * NOTE(review): presumably rb_str_new4() yields a frozen snapshot
           * and str_mod_check() raises if the buffer changed -- confirm.
           */
06190     str = rb_str_new4(str);
06191     ptr = p = s = RSTRING_PTR(str);
06192     pend = p + RSTRING_LEN(str);
06193     len = RSTRING_LEN(str);
06194     StringValue(rs);
06195     if (rs == rb_default_rs) {
              /*
               * Default separator: look for '\n' bytes with memchr() and
               * then verify, in terms of the string's encoding, that the
               * byte really belongs to a newline character.
               */
06196         enc = rb_enc_get(str);
06197         while (p < pend) {
06198             char *p0;
06199 
06200             p = memchr(p, '\n', pend - p);
06201             if (!p) break;
06202             p0 = rb_enc_left_char_head(s, p, pend, enc);
06203             if (!rb_enc_is_newline(p0, pend, enc)) {
06204                 p++;
06205                 continue;
06206             }
06207             p = p0 + rb_enc_mbclen(p0, pend, enc);
06208             line = rb_str_subseq(str, s - ptr, p - s);
06209             if (wantarray)
06210                 rb_ary_push(ary, line);
06211             else
06212                 rb_yield(line);
06213             str_mod_check(str, ptr, len);
06214             s = p;
06215         }
06216         goto finish;
06217     }
06218 
          /* Custom separator.  newline is the character that can start a
           * separator: '\n' for an empty rs, else the first character of
           * rs. */
06219     enc = rb_enc_check(str, rs);
06220     rslen = RSTRING_LEN(rs);
06221     if (rslen == 0) {
06222         newline = '\n';
06223     }
06224     else {
06225         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06226     }
06227 
06228     while (p < pend) {
06229         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06230 
06231       again:
              /*
               * Empty rs: a newline that is followed by a different
               * character does not end a record (the following character is
               * re-examined via "again").  A run of newlines is consumed
               * completely and p is then moved back onto its last newline
               * so that the cut below lands after the whole run.
               */
06232         if (rslen == 0 && c == newline) {
06233             p += n;
06234             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06235                 goto again;
06236             }
06237             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06238                 p += n;
06239             }
06240             p -= n;
06241         }
              /* Separator found at p: for a separator longer than one byte
               * the full byte sequence must match as well. */
06242         if (c == newline &&
06243             (rslen <= 1 ||
06244              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06245             const char *pp = p + (rslen ? rslen : n);
06246             line = rb_str_subseq(str, s - ptr, pp - s);
06247             if (wantarray)
06248                 rb_ary_push(ary, line);
06249             else
06250                 rb_yield(line);
06251             str_mod_check(str, ptr, len);
06252             s = pp;
06253         }
06254         p += n;
06255     }
06256 
06257   finish:
          /* Whatever follows the last separator forms the final record. */
06258     if (s != pend) {
06259         line = rb_str_subseq(str, s - ptr, pend - s);
06260         if (wantarray)
06261             rb_ary_push(ary, line);
06262         else
06263             rb_yield(line);
06264         RB_GC_GUARD(str);
06265     }
06266 
06267     if (wantarray)
06268         return ary;
06269     else
06270         return orig;
06271 }
06272 
06273 /*
06274  *  call-seq:
06275  *     str.each_line(separator=$/) {|substr| block }   -> str
06276  *     str.each_line(separator=$/)                     -> an_enumerator
06277  *
06278  *  Splits <i>str</i> using the supplied parameter as the record
06279  *  separator (<code>$/</code> by default), passing each substring in
06280  *  turn to the supplied block.  If a zero-length record separator is
06281  *  supplied, the string is split into paragraphs delimited by
06282  *  multiple successive newlines.
06283  *
06284  *  If no block is given, an enumerator is returned instead.
06285  *
06286  *     print "Example one\n"
06287  *     "hello\nworld".each_line {|s| p s}
06288  *     print "Example two\n"
06289  *     "hello\nworld".each_line('l') {|s| p s}
06290  *     print "Example three\n"
06291  *     "hello\n\n\nworld".each_line('') {|s| p s}
06292  *
06293  *  <em>produces:</em>
06294  *
06295  *     Example one
06296  *     "hello\n"
06297  *     "world"
06298  *     Example two
06299  *     "hel"
06300  *     "l"
06301  *     "o\nworl"
06302  *     "d"
06303  *     Example three
06304  *     "hello\n\n\n"
06305  *     "world"
06306  */
06307 
06308 static VALUE
06309 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06310 {
06311     return rb_str_enumerate_lines(argc, argv, str, 0);
06312 }
06313 
06314 /*
06315  *  call-seq:
06316  *     str.lines(separator=$/)  -> an_array
06317  *
06318  *  Returns an array of lines in <i>str</i> split using the supplied
06319  *  record separator (<code>$/</code> by default).  This is a
06320  *  shorthand for <code>str.each_line(separator).to_a</code>.
06321  *
06322  *  If a block is given, which is a deprecated form, works the same as
06323  *  <code>each_line</code>.
06324  */
06325 
06326 static VALUE
06327 rb_str_lines(int argc, VALUE *argv, VALUE str)
06328 {
06329     return rb_str_enumerate_lines(argc, argv, str, 1);
06330 }
06331 
06332 static VALUE
06333 rb_str_each_byte_size(VALUE str, VALUE args)
06334 {
06335     return LONG2FIX(RSTRING_LEN(str));
06336 }
06337 
06338 static VALUE
06339 rb_str_enumerate_bytes(VALUE str, int wantarray)
06340 {
06341     long i;
06342     VALUE UNINITIALIZED_VAR(ary);
06343 
06344     if (rb_block_given_p()) {
06345         if (wantarray) {
06346 #if 0 /* next major */
06347             rb_warn("given block not used");
06348             ary = rb_ary_new();
06349 #else
06350             rb_warning("passing a block to String#bytes is deprecated");
06351             wantarray = 0;
06352 #endif
06353         }
06354     }
06355     else {
06356         if (wantarray)
06357             ary = rb_ary_new2(RSTRING_LEN(str));
06358         else
06359             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
06360     }
06361 
06362     for (i=0; i<RSTRING_LEN(str); i++) {
06363         if (wantarray)
06364             rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06365         else
06366             rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06367     }
06368     if (wantarray)
06369         return ary;
06370     else
06371         return str;
06372 }
06373 
06374 /*
06375  *  call-seq:
06376  *     str.each_byte {|fixnum| block }    -> str
06377  *     str.each_byte                      -> an_enumerator
06378  *
06379  *  Passes each byte in <i>str</i> to the given block, or returns an
06380  *  enumerator if no block is given.
06381  *
06382  *     "hello".each_byte {|c| print c, ' ' }
06383  *
06384  *  <em>produces:</em>
06385  *
06386  *     104 101 108 108 111
06387  */
06388 
06389 static VALUE
06390 rb_str_each_byte(VALUE str)
06391 {
06392     return rb_str_enumerate_bytes(str, 0);
06393 }
06394 
06395 /*
06396  *  call-seq:
06397  *     str.bytes    -> an_array
06398  *
06399  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
06400  *  <code>str.each_byte.to_a</code>.
06401  *
06402  *  If a block is given, which is a deprecated form, works the same as
06403  *  <code>each_byte</code>.
06404  */
06405 
06406 static VALUE
06407 rb_str_bytes(VALUE str)
06408 {
06409     return rb_str_enumerate_bytes(str, 1);
06410 }
06411 
06412 static VALUE
06413 rb_str_each_char_size(VALUE str)
06414 {
06415     long len = RSTRING_LEN(str);
06416     if (!single_byte_optimizable(str)) {
06417         const char *ptr = RSTRING_PTR(str);
06418         rb_encoding *enc = rb_enc_get(str);
06419         const char *end_ptr = ptr + len;
06420         for (len = 0; ptr < end_ptr; ++len) {
06421             ptr += rb_enc_mbclen(ptr, end_ptr, enc);
06422         }
06423     }
06424     return LONG2FIX(len);
06425 }
06426 
06427 static VALUE
06428 rb_str_enumerate_chars(VALUE str, int wantarray)
06429 {
06430     VALUE orig = str;
06431     VALUE substr;
06432     long i, len, n;
06433     const char *ptr;
06434     rb_encoding *enc;
06435     VALUE UNINITIALIZED_VAR(ary);
06436 
06437     if (rb_block_given_p()) {
06438         if (wantarray) {
06439 #if 0 /* next major */
06440             rb_warn("given block not used");
06441             ary = rb_ary_new();
06442 #else
06443             rb_warning("passing a block to String#chars is deprecated");
06444             wantarray = 0;
06445 #endif
06446         }
06447     }
06448     else {
06449         if (wantarray)
06450             ary = rb_ary_new();
06451         else
06452             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06453     }
06454 
06455     str = rb_str_new4(str);
06456     ptr = RSTRING_PTR(str);
06457     len = RSTRING_LEN(str);
06458     enc = rb_enc_get(str);
06459     switch (ENC_CODERANGE(str)) {
06460       case ENC_CODERANGE_VALID:
06461       case ENC_CODERANGE_7BIT:
06462         for (i = 0; i < len; i += n) {
06463             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06464             substr = rb_str_subseq(str, i, n);
06465             if (wantarray)
06466                 rb_ary_push(ary, substr);
06467             else
06468                 rb_yield(substr);
06469         }
06470         break;
06471       default:
06472         for (i = 0; i < len; i += n) {
06473             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06474             substr = rb_str_subseq(str, i, n);
06475             if (wantarray)
06476                 rb_ary_push(ary, substr);
06477             else
06478                 rb_yield(substr);
06479         }
06480     }
06481     RB_GC_GUARD(str);
06482     if (wantarray)
06483         return ary;
06484     else
06485         return orig;
06486 }
06487 
06488 /*
06489  *  call-seq:
06490  *     str.each_char {|cstr| block }    -> str
06491  *     str.each_char                    -> an_enumerator
06492  *
06493  *  Passes each character in <i>str</i> to the given block, or returns
06494  *  an enumerator if no block is given.
06495  *
06496  *     "hello".each_char {|c| print c, ' ' }
06497  *
06498  *  <em>produces:</em>
06499  *
06500  *     h e l l o
06501  */
06502 
06503 static VALUE
06504 rb_str_each_char(VALUE str)
06505 {
06506     return rb_str_enumerate_chars(str, 0);
06507 }
06508 
06509 /*
06510  *  call-seq:
06511  *     str.chars    -> an_array
06512  *
06513  *  Returns an array of characters in <i>str</i>.  This is a shorthand
06514  *  for <code>str.each_char.to_a</code>.
06515  *
06516  *  If a block is given, which is a deprecated form, works the same as
06517  *  <code>each_char</code>.
06518  */
06519 
06520 static VALUE
06521 rb_str_chars(VALUE str)
06522 {
06523     return rb_str_enumerate_chars(str, 1);
06524 }
06525 
06526 
06527 static VALUE
06528 rb_str_enumerate_codepoints(VALUE str, int wantarray)
06529 {
06530     VALUE orig = str;
06531     int n;
06532     unsigned int c;
06533     const char *ptr, *end;
06534     rb_encoding *enc;
06535     VALUE UNINITIALIZED_VAR(ary);
06536 
06537     if (single_byte_optimizable(str))
06538         return rb_str_enumerate_bytes(str, wantarray);
06539 
06540     if (rb_block_given_p()) {
06541         if (wantarray) {
06542 #if 0 /* next major */
06543             rb_warn("given block not used");
06544             ary = rb_ary_new();
06545 #else
06546             rb_warning("passing a block to String#codepoints is deprecated");
06547             wantarray = 0;
06548 #endif
06549         }
06550     }
06551     else {
06552         if (wantarray)
06553             ary = rb_ary_new();
06554         else
06555             RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
06556     }
06557 
06558     str = rb_str_new4(str);
06559     ptr = RSTRING_PTR(str);
06560     end = RSTRING_END(str);
06561     enc = STR_ENC_GET(str);
06562     while (ptr < end) {
06563         c = rb_enc_codepoint_len(ptr, end, &n, enc);
06564         if (wantarray)
06565             rb_ary_push(ary, UINT2NUM(c));
06566         else
06567             rb_yield(UINT2NUM(c));
06568         ptr += n;
06569     }
06570     RB_GC_GUARD(str);
06571     if (wantarray)
06572         return ary;
06573     else
06574         return orig;
06575 }
06576 
06577 /*
06578  *  call-seq:
06579  *     str.each_codepoint {|integer| block }    -> str
06580  *     str.each_codepoint                       -> an_enumerator
06581  *
06582  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
06583  *  also known as a <i>codepoint</i> when applied to Unicode strings to the
06584  *  given block.
06585  *
06586  *  If no block is given, an enumerator is returned instead.
06587  *
06588  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
06589  *
06590  *  <em>produces:</em>
06591  *
06592  *     104 101 108 108 111 1593
06593  */
06594 
06595 static VALUE
06596 rb_str_each_codepoint(VALUE str)
06597 {
06598     return rb_str_enumerate_codepoints(str, 0);
06599 }
06600 
06601 /*
06602  *  call-seq:
06603  *     str.codepoints   -> an_array
06604  *
06605  *  Returns an array of the <code>Integer</code> ordinals of the
06606  *  characters in <i>str</i>.  This is a shorthand for
06607  *  <code>str.each_codepoint.to_a</code>.
06608  *
06609  *  If a block is given, which is a deprecated form, works the same as
06610  *  <code>each_codepoint</code>.
06611  */
06612 
06613 static VALUE
06614 rb_str_codepoints(VALUE str)
06615 {
06616     return rb_str_enumerate_codepoints(str, 1);
06617 }
06618 
06619 
06620 static long
06621 chopped_length(VALUE str)
06622 {
06623     rb_encoding *enc = STR_ENC_GET(str);
06624     const char *p, *p2, *beg, *end;
06625 
06626     beg = RSTRING_PTR(str);
06627     end = beg + RSTRING_LEN(str);
06628     if (beg > end) return 0;
06629     p = rb_enc_prev_char(beg, end, end, enc);
06630     if (!p) return 0;
06631     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06632         p2 = rb_enc_prev_char(beg, p, end, enc);
06633         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06634     }
06635     return p - beg;
06636 }
06637 
06638 /*
06639  *  call-seq:
06640  *     str.chop!   -> str or nil
06641  *
06642  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06643  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06644  *  <code>String#chomp!</code>.
06645  */
06646 
06647 static VALUE
06648 rb_str_chop_bang(VALUE str)
06649 {
06650     str_modify_keep_cr(str);
06651     if (RSTRING_LEN(str) > 0) {
06652         long len;
06653         len = chopped_length(str);
06654         STR_SET_LEN(str, len);
06655         RSTRING_PTR(str)[len] = '\0';
06656         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06657             ENC_CODERANGE_CLEAR(str);
06658         }
06659         return str;
06660     }
06661     return Qnil;
06662 }
06663 
06664 
06665 /*
06666  *  call-seq:
06667  *     str.chop   -> new_str
06668  *
06669  *  Returns a new <code>String</code> with the last character removed.  If the
06670  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06671  *  <code>chop</code> to an empty string returns an empty
06672  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06673  *  the string unchanged if it doesn't end in a record separator.
06674  *
06675  *     "string\r\n".chop   #=> "string"
06676  *     "string\n\r".chop   #=> "string\n"
06677  *     "string\n".chop     #=> "string"
06678  *     "string".chop       #=> "strin"
06679  *     "x".chop.chop       #=> ""
06680  */
06681 
06682 static VALUE
06683 rb_str_chop(VALUE str)
06684 {
06685     return rb_str_subseq(str, 0, chopped_length(str));
06686 }
06687 
06688 
06689 /*
06690  *  call-seq:
06691  *     str.chomp!(separator=$/)   -> str or nil
06692  *
06693  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06694  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06695  */
06696 
06697 static VALUE
06698 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06699 {
          /*
           * Removes a trailing record separator from str in place.  The
           * separator is the optional argument, or rb_rs when no argument
           * is given.  Returns str when something was removed and nil
           * otherwise.
           */
06700     rb_encoding *enc;
06701     VALUE rs;
06702     int newline;
06703     char *p, *pp, *e;
06704     long len, rslen;
06705 
06706     str_modify_keep_cr(str);
06707     len = RSTRING_LEN(str);
06708     if (len == 0) return Qnil;
06709     p = RSTRING_PTR(str);
06710     e = p + len;
06711     if (argc == 0) {
06712         rs = rb_rs;
06713         if (rs == rb_default_rs) {
                  /*
                   * "Smart" chomp, also reached below when rs is exactly
                   * "\n": strips a trailing "\n", "\r\n" or "\r".
                   */
06714           smart_chomp:
06715             enc = rb_enc_get(str);
06716             if (rb_enc_mbminlen(enc) > 1) {
                      /* Encodings whose smallest character is wider than
                       * one byte: examine whole characters.  e is pulled
                       * back over a final newline and then over a "\r"
                       * character in front of the (possibly new) end. */
06717                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06718                 if (rb_enc_is_newline(pp, e, enc)) {
06719                     e = pp;
06720                 }
06721                 pp = e - rb_enc_mbminlen(enc);
06722                 if (pp >= p) {
06723                     pp = rb_enc_left_char_head(p, pp, e, enc);
06724                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06725                         e = pp;
06726                     }
06727                 }
                      /* e did not move: nothing to remove. */
06728                 if (e == RSTRING_END(str)) {
06729                     return Qnil;
06730                 }
06731                 len = e - RSTRING_PTR(str);
06732                 STR_SET_LEN(str, len);
06733             }
06734             else {
                      /* Bytewise check of the last one or two bytes. */
06735                 if (RSTRING_PTR(str)[len-1] == '\n') {
06736                     STR_DEC_LEN(str);
06737                     if (RSTRING_LEN(str) > 0 &&
06738                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06739                         STR_DEC_LEN(str);
06740                     }
06741                 }
06742                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06743                     STR_DEC_LEN(str);
06744                 }
06745                 else {
06746                     return Qnil;
06747                 }
06748             }
06749             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06750             return str;
06751         }
06752     }
06753     else {
06754         rb_scan_args(argc, argv, "01", &rs);
06755     }
          /* A nil separator removes nothing. */
06756     if (NIL_P(rs)) return Qnil;
06757     StringValue(rs);
06758     rslen = RSTRING_LEN(rs);
06759     if (rslen == 0) {
              /* Empty separator: strip every trailing "\n", each together
               * with a "\r" directly in front of it. */
06760         while (len>0 && p[len-1] == '\n') {
06761             len--;
06762             if (len>0 && p[len-1] == '\r')
06763                 len--;
06764         }
06765         if (len < RSTRING_LEN(str)) {
06766             STR_SET_LEN(str, len);
06767             RSTRING_PTR(str)[len] = '\0';
06768             return str;
06769         }
06770         return Qnil;
06771     }
          /* A separator longer than the string cannot be its suffix. */
06772     if (rslen > len) return Qnil;
          /* newline holds the last byte of rs; an rs of exactly "\n" is
           * handled by the smart chomp above. */
06773     newline = RSTRING_PTR(rs)[rslen-1];
06774     if (rslen == 1 && newline == '\n')
06775         goto smart_chomp;
06776 
06777     enc = rb_enc_check(str, rs);
06778     if (is_broken_string(rs)) {
06779         return Qnil;
06780     }
          /* General case: the last byte is compared first as a cheap
           * filter, then the whole tail.  The match only counts when it
           * starts on a character boundary of str. */
06781     pp = e - rslen;
06782     if (p[len-1] == newline &&
06783         (rslen <= 1 ||
06784          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06785         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06786             return Qnil;
06787         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06788             ENC_CODERANGE_CLEAR(str);
06789         }
06790         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06791         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06792         return str;
06793     }
06794     return Qnil;
06795 }
06796 
06797 
06798 /*
06799  *  call-seq:
06800  *     str.chomp(separator=$/)   -> new_str
06801  *
06802  *  Returns a new <code>String</code> with the given record separator removed
06803  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06804  *  changed from the default Ruby record separator, then <code>chomp</code> also
06805  *  removes carriage return characters (that is, it will remove <code>\n</code>,
06806  *  <code>\r</code>, and <code>\r\n</code>).
06807  *
06808  *     "hello".chomp            #=> "hello"
06809  *     "hello\n".chomp          #=> "hello"
06810  *     "hello\r\n".chomp        #=> "hello"
06811  *     "hello\n\r".chomp        #=> "hello\n"
06812  *     "hello\r".chomp          #=> "hello"
06813  *     "hello \n there".chomp   #=> "hello \n there"
06814  *     "hello".chomp("llo")     #=> "he"
06815  */
06816 
06817 static VALUE
06818 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06819 {
06820     str = rb_str_dup(str);
06821     rb_str_chomp_bang(argc, argv, str);
06822     return str;
06823 }
06824 
06825 /*
06826  *  call-seq:
06827  *     str.lstrip!   -> self or nil
06828  *
06829  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06830  *  change was made. See also <code>String#rstrip!</code> and
06831  *  <code>String#strip!</code>.
06832  *
06833  *     "  hello  ".lstrip!  #=> "hello  "
06834  *     "hello".lstrip!      #=> nil
06835  */
06836 
06837 static VALUE
06838 rb_str_lstrip_bang(VALUE str)
06839 {
06840     rb_encoding *enc;
06841     char *s, *t, *e;
06842 
06843     str_modify_keep_cr(str);
06844     enc = STR_ENC_GET(str);
06845     s = RSTRING_PTR(str);
06846     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06847     e = t = RSTRING_END(str);
06848     /* remove spaces at head */
06849     while (s < e) {
06850         int n;
06851         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06852 
06853         if (!rb_isspace(cc)) break;
06854         s += n;
06855     }
06856 
06857     if (s > RSTRING_PTR(str)) {
06858         STR_SET_LEN(str, t-s);
06859         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06860         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06861         return str;
06862     }
06863     return Qnil;
06864 }
06865 
06866 
06867 /*
06868  *  call-seq:
06869  *     str.lstrip   -> new_str
06870  *
06871  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06872  *  <code>String#rstrip</code> and <code>String#strip</code>.
06873  *
06874  *     "  hello  ".lstrip   #=> "hello  "
06875  *     "hello".lstrip       #=> "hello"
06876  */
06877 
06878 static VALUE
06879 rb_str_lstrip(VALUE str)
06880 {
06881     str = rb_str_dup(str);
06882     rb_str_lstrip_bang(str);
06883     return str;
06884 }
06885 
06886 
06887 /*
06888  *  call-seq:
06889  *     str.rstrip!   -> self or nil
06890  *
06891  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06892  *  no change was made. See also <code>String#lstrip!</code> and
06893  *  <code>String#strip!</code>.
06894  *
06895  *     "  hello  ".rstrip!  #=> "  hello"
06896  *     "hello".rstrip!      #=> nil
06897  */
06898 
06899 static VALUE
06900 rb_str_rstrip_bang(VALUE str)
06901 {
06902     rb_encoding *enc;
06903     char *s, *t, *e;
06904 
06905     str_modify_keep_cr(str);
06906     enc = STR_ENC_GET(str);
06907     rb_str_check_dummy_enc(enc);
06908     s = RSTRING_PTR(str);
06909     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06910     t = e = RSTRING_END(str);
06911 
06912     /* remove trailing spaces or '\0's */
06913     if (single_byte_optimizable(str)) {
06914         unsigned char c;
06915         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06916     }
06917     else {
06918         char *tp;
06919 
06920         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06921             unsigned int c = rb_enc_codepoint(tp, e, enc);
06922             if (c && !rb_isspace(c)) break;
06923             t = tp;
06924         }
06925     }
06926     if (t < e) {
06927         long len = t-RSTRING_PTR(str);
06928 
06929         STR_SET_LEN(str, len);
06930         RSTRING_PTR(str)[len] = '\0';
06931         return str;
06932     }
06933     return Qnil;
06934 }
06935 
06936 
06937 /*
06938  *  call-seq:
06939  *     str.rstrip   -> new_str
06940  *
06941  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06942  *  <code>String#lstrip</code> and <code>String#strip</code>.
06943  *
06944  *     "  hello  ".rstrip   #=> "  hello"
06945  *     "hello".rstrip       #=> "hello"
06946  */
06947 
06948 static VALUE
06949 rb_str_rstrip(VALUE str)
06950 {
06951     str = rb_str_dup(str);
06952     rb_str_rstrip_bang(str);
06953     return str;
06954 }
06955 
06956 
06957 /*
06958  *  call-seq:
06959  *     str.strip!   -> str or nil
06960  *
06961  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06962  *  <code>nil</code> if <i>str</i> was not altered.
06963  */
06964 
06965 static VALUE
06966 rb_str_strip_bang(VALUE str)
06967 {
06968     VALUE l = rb_str_lstrip_bang(str);
06969     VALUE r = rb_str_rstrip_bang(str);
06970 
06971     if (NIL_P(l) && NIL_P(r)) return Qnil;
06972     return str;
06973 }
06974 
06975 
06976 /*
06977  *  call-seq:
06978  *     str.strip   -> new_str
06979  *
06980  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06981  *
06982  *     "    hello    ".strip   #=> "hello"
06983  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06984  */
06985 
06986 static VALUE
06987 rb_str_strip(VALUE str)
06988 {
06989     str = rb_str_dup(str);
06990     rb_str_strip_bang(str);
06991     return str;
06992 }
06993 
06994 static VALUE
06995 scan_once(VALUE str, VALUE pat, long *start)
06996 {
06997     VALUE result, match;
06998     struct re_registers *regs;
06999     int i;
07000 
07001     if (rb_reg_search(pat, str, *start, 0) >= 0) {
07002         match = rb_backref_get();
07003         regs = RMATCH_REGS(match);
07004         if (BEG(0) == END(0)) {
07005             rb_encoding *enc = STR_ENC_GET(str);
07006             /*
07007              * Always consume at least one character of the input string
07008              */
07009             if (RSTRING_LEN(str) > END(0))
07010                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
07011                                                    RSTRING_END(str), enc);
07012             else
07013                 *start = END(0)+1;
07014         }
07015         else {
07016             *start = END(0);
07017         }
07018         if (regs->num_regs == 1) {
07019             return rb_reg_nth_match(0, match);
07020         }
07021         result = rb_ary_new2(regs->num_regs);
07022         for (i=1; i < regs->num_regs; i++) {
07023             rb_ary_push(result, rb_reg_nth_match(i, match));
07024         }
07025 
07026         return result;
07027     }
07028     return Qnil;
07029 }
07030 
07031 
07032 /*
07033  *  call-seq:
07034  *     str.scan(pattern)                         -> array
07035  *     str.scan(pattern) {|match, ...| block }   -> str
07036  *
07037  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
07038  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
07039  *  generated and either added to the result array or passed to the block. If
07040  *  the pattern contains no groups, each individual result consists of the
07041  *  matched string, <code>$&</code>.  If the pattern contains groups, each
07042  *  individual result is itself an array containing one entry per group.
07043  *
07044  *     a = "cruel world"
07045  *     a.scan(/\w+/)        #=> ["cruel", "world"]
07046  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
07047  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
07048  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
07049  *
07050  *  And the block form:
07051  *
07052  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
07053  *     print "\n"
07054  *     a.scan(/(.)(.)/) {|x,y| print y, x }
07055  *     print "\n"
07056  *
07057  *  <em>produces:</em>
07058  *
07059  *     <<cruel>> <<world>>
07060  *     rceu lowlr
07061  */
07062 
07063 static VALUE
07064 rb_str_scan(VALUE str, VALUE pat)
07065 {
07066     VALUE result;
07067     long start = 0;
07068     long last = -1, prev = 0;
07069     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
07070 
07071     pat = get_pat(pat, 1);
07072     if (!rb_block_given_p()) {
07073         VALUE ary = rb_ary_new();
07074 
07075         while (!NIL_P(result = scan_once(str, pat, &start))) {
07076             last = prev;
07077             prev = start;
07078             rb_ary_push(ary, result);
07079         }
07080         if (last >= 0) rb_reg_search(pat, str, last, 0);
07081         return ary;
07082     }
07083 
07084     while (!NIL_P(result = scan_once(str, pat, &start))) {
07085         last = prev;
07086         prev = start;
07087         rb_yield(result);
07088         str_mod_check(str, p, len);
07089     }
07090     if (last >= 0) rb_reg_search(pat, str, last, 0);
07091     return str;
07092 }
07093 
07094 
07095 /*
07096  *  call-seq:
07097  *     str.hex   -> integer
07098  *
07099  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
07100  *  (with an optional sign and an optional <code>0x</code>) and returns the
07101  *  corresponding number. Zero is returned on error.
07102  *
07103  *     "0x0a".hex     #=> 10
07104  *     "-1234".hex    #=> -4660
07105  *     "0".hex        #=> 0
07106  *     "wombat".hex   #=> 0
07107  */
07108 
07109 static VALUE
07110 rb_str_hex(VALUE str)
07111 {
07112     return rb_str_to_inum(str, 16, FALSE);
07113 }
07114 
07115 
07116 /*
07117  *  call-seq:
07118  *     str.oct   -> integer
07119  *
07120  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
07121  *  optional sign) and returns the corresponding number.  Returns 0 if the
07122  *  conversion fails.
07123  *
07124  *     "123".oct       #=> 83
07125  *     "-377".oct      #=> -255
07126  *     "bad".oct       #=> 0
07127  *     "0377bad".oct   #=> 255
07128  */
07129 
07130 static VALUE
07131 rb_str_oct(VALUE str)
07132 {
07133     return rb_str_to_inum(str, -8, FALSE);
07134 }
07135 
07136 
07137 /*
07138  *  call-seq:
07139  *     str.crypt(salt_str)   -> new_str
07140  *
07141  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the
07142  *  standard library function <code>crypt(3)</code> with the given
07143  *  salt string.  While the format and the result are system and
07144  *  implementation dependent, using a salt matching the regular
07145  *  expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
07146  *  safe on any platform, in which only the first two characters are
07147  *  significant.
07148  *
07149  *  This method is for use in system specific scripts, so if you want
07150  *  a cross-platform hash function consider using Digest or OpenSSL
07151  *  instead.
07152  */
07153 
07154 static VALUE
07155 rb_str_crypt(VALUE str, VALUE salt)
07156 {
07157     extern char *crypt(const char *, const char *);
07158     VALUE result;
07159     const char *s, *saltp;
07160     char *res;
07161 #ifdef BROKEN_CRYPT
07162     char salt_8bit_clean[3];
07163 #endif
07164 
07165     StringValue(salt);
07166     if (RSTRING_LEN(salt) < 2)
07167         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
07168 
07169     s = RSTRING_PTR(str);
07170     if (!s) s = "";
07171     saltp = RSTRING_PTR(salt);
07172 #ifdef BROKEN_CRYPT
07173     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
07174         salt_8bit_clean[0] = saltp[0] & 0x7f;
07175         salt_8bit_clean[1] = saltp[1] & 0x7f;
07176         salt_8bit_clean[2] = '\0';
07177         saltp = salt_8bit_clean;
07178     }
07179 #endif
07180     res = crypt(s, saltp);
07181     if (!res) {
07182         rb_sys_fail("crypt");
07183     }
07184     result = rb_str_new2(res);
07185     OBJ_INFECT(result, str);
07186     OBJ_INFECT(result, salt);
07187     return result;
07188 }
07189 
07190 
07191 /*
07192  *  call-seq:
07193  *     str.intern   -> symbol
07194  *     str.to_sym   -> symbol
07195  *
07196  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
07197  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
07198  *
07199  *     "Koala".intern         #=> :Koala
07200  *     s = 'cat'.to_sym       #=> :cat
07201  *     s == :cat              #=> true
07202  *     s = '@cat'.to_sym      #=> :@cat
07203  *     s == :@cat             #=> true
07204  *
07205  *  This can also be used to create symbols that cannot be represented using the
07206  *  <code>:xxx</code> notation.
07207  *
07208  *     'cat and dog'.to_sym   #=> :"cat and dog"
07209  */
07210 
07211 VALUE
07212 rb_str_intern(VALUE s)
07213 {
07214     VALUE str = RB_GC_GUARD(s);
07215     ID id;
07216 
07217     id = rb_intern_str(str);
07218     return ID2SYM(id);
07219 }
07220 
07221 
07222 /*
07223  *  call-seq:
07224  *     str.ord   -> integer
07225  *
07226  *  Return the <code>Integer</code> ordinal of a one-character string.
07227  *
07228  *     "a".ord         #=> 97
07229  */
07230 
07231 VALUE
07232 rb_str_ord(VALUE s)
07233 {
07234     unsigned int c;
07235 
07236     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
07237     return UINT2NUM(c);
07238 }
07239 /*
07240  *  call-seq:
07241  *     str.sum(n=16)   -> integer
07242  *
07243  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
07244  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
07245  *  to 16. The result is simply the sum of the binary value of each character in
07246  *  <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
07247  *  checksum.
07248  */
07249 
07250 static VALUE
07251 rb_str_sum(int argc, VALUE *argv, VALUE str)
07252 {
07253     VALUE vbits;
07254     int bits;
07255     char *ptr, *p, *pend;
07256     long len;
07257     VALUE sum = INT2FIX(0);
07258     unsigned long sum0 = 0;
07259 
07260     if (argc == 0) {
07261         bits = 16;
07262     }
07263     else {
07264         rb_scan_args(argc, argv, "01", &vbits);
07265         bits = NUM2INT(vbits);
07266     }
07267     ptr = p = RSTRING_PTR(str);
07268     len = RSTRING_LEN(str);
07269     pend = p + len;
07270 
07271     while (p < pend) {
07272         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
07273             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07274             str_mod_check(str, ptr, len);
07275             sum0 = 0;
07276         }
07277         sum0 += (unsigned char)*p;
07278         p++;
07279     }
07280 
07281     if (bits == 0) {
07282         if (sum0) {
07283             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07284         }
07285     }
07286     else {
07287         if (sum == INT2FIX(0)) {
07288             if (bits < (int)sizeof(long)*CHAR_BIT) {
07289                 sum0 &= (((unsigned long)1)<<bits)-1;
07290             }
07291             sum = LONG2FIX(sum0);
07292         }
07293         else {
07294             VALUE mod;
07295 
07296             if (sum0) {
07297                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
07298             }
07299 
07300             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
07301             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
07302             sum = rb_funcall(sum, '&', 1, mod);
07303         }
07304     }
07305     return sum;
07306 }
07307 
07308 static VALUE
07309 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
07310 {
07311     rb_encoding *enc;
07312     VALUE w;
07313     long width, len, flen = 1, fclen = 1;
07314     VALUE res;
07315     char *p;
07316     const char *f = " ";
07317     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
07318     volatile VALUE pad;
07319     int singlebyte = 1, cr;
07320 
07321     rb_scan_args(argc, argv, "11", &w, &pad);
07322     enc = STR_ENC_GET(str);
07323     width = NUM2LONG(w);
07324     if (argc == 2) {
07325         StringValue(pad);
07326         enc = rb_enc_check(str, pad);
07327         f = RSTRING_PTR(pad);
07328         flen = RSTRING_LEN(pad);
07329         fclen = str_strlen(pad, enc);
07330         singlebyte = single_byte_optimizable(pad);
07331         if (flen == 0 || fclen == 0) {
07332             rb_raise(rb_eArgError, "zero width padding");
07333         }
07334     }
07335     len = str_strlen(str, enc);
07336     if (width < 0 || len >= width) return rb_str_dup(str);
07337     n = width - len;
07338     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
07339     rlen = n - llen;
07340     cr = ENC_CODERANGE(str);
07341     if (flen > 1) {
07342        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
07343        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
07344     }
07345     size = RSTRING_LEN(str);
07346     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
07347        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
07348        (len += llen2 + rlen2) >= LONG_MAX - size) {
07349        rb_raise(rb_eArgError, "argument too big");
07350     }
07351     len += size;
07352     res = rb_str_new5(str, 0, len);
07353     p = RSTRING_PTR(res);
07354     if (flen <= 1) {
07355        memset(p, *f, llen);
07356        p += llen;
07357     }
07358     else {
07359        while (llen >= fclen) {
07360             memcpy(p,f,flen);
07361             p += flen;
07362             llen -= fclen;
07363         }
07364        if (llen > 0) {
07365            memcpy(p, f, llen2);
07366            p += llen2;
07367         }
07368     }
07369     memcpy(p, RSTRING_PTR(str), size);
07370     p += size;
07371     if (flen <= 1) {
07372        memset(p, *f, rlen);
07373        p += rlen;
07374     }
07375     else {
07376        while (rlen >= fclen) {
07377             memcpy(p,f,flen);
07378             p += flen;
07379             rlen -= fclen;
07380         }
07381        if (rlen > 0) {
07382            memcpy(p, f, rlen2);
07383            p += rlen2;
07384         }
07385     }
07386     *p = '\0';
07387     STR_SET_LEN(res, p-RSTRING_PTR(res));
07388     OBJ_INFECT(res, str);
07389     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07390     rb_enc_associate(res, enc);
07391     if (argc == 2)
07392         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07393     if (cr != ENC_CODERANGE_BROKEN)
07394         ENC_CODERANGE_SET(res, cr);
07395     return res;
07396 }
07397 
07398 
07399 /*
07400  *  call-seq:
07401  *     str.ljust(integer, padstr=' ')   -> new_str
07402  *
07403  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07404  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
07405  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07406  *
07407  *     "hello".ljust(4)            #=> "hello"
07408  *     "hello".ljust(20)           #=> "hello               "
07409  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
07410  */
07411 
07412 static VALUE
07413 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07414 {
07415     return rb_str_justify(argc, argv, str, 'l');
07416 }
07417 
07418 
07419 /*
07420  *  call-seq:
07421  *     str.rjust(integer, padstr=' ')   -> new_str
07422  *
07423  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07424  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
07425  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07426  *
07427  *     "hello".rjust(4)            #=> "hello"
07428  *     "hello".rjust(20)           #=> "               hello"
07429  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
07430  */
07431 
07432 static VALUE
07433 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07434 {
07435     return rb_str_justify(argc, argv, str, 'r');
07436 }
07437 
07438 
07439 /*
07440  *  call-seq:
07441  *     str.center(width, padstr=' ')   -> new_str
07442  *
07443  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
07444  *  returns a new String of length +width+ with +str+ centered and padded with
07445  *  +padstr+; otherwise, returns +str+.
07446  *
07447  *     "hello".center(4)         #=> "hello"
07448  *     "hello".center(20)        #=> "       hello        "
07449  *     "hello".center(20, '123') #=> "1231231hello12312312"
07450  */
07451 
07452 static VALUE
07453 rb_str_center(int argc, VALUE *argv, VALUE str)
07454 {
07455     return rb_str_justify(argc, argv, str, 'c');
07456 }
07457 
07458 /*
07459  *  call-seq:
07460  *     str.partition(sep)              -> [head, sep, tail]
07461  *     str.partition(regexp)           -> [head, match, tail]
07462  *
07463  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
07464  *  and returns the part before it, the match, and the part
07465  *  after it.
07466  *  If it is not found, returns <i>str</i> and two empty strings.
07467  *
07468  *     "hello".partition("l")         #=> ["he", "l", "lo"]
07469  *     "hello".partition("x")         #=> ["hello", "", ""]
07470  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
07471  */
07472 
07473 static VALUE
07474 rb_str_partition(VALUE str, VALUE sep)
07475 {
07476     long pos;
07477     int regex = FALSE;
07478 
07479     if (RB_TYPE_P(sep, T_REGEXP)) {
07480         pos = rb_reg_search(sep, str, 0, 0);
07481         regex = TRUE;
07482     }
07483     else {
07484         VALUE tmp;
07485 
07486         tmp = rb_check_string_type(sep);
07487         if (NIL_P(tmp)) {
07488             rb_raise(rb_eTypeError, "type mismatch: %s given",
07489                      rb_obj_classname(sep));
07490         }
07491         sep = tmp;
07492         pos = rb_str_index(str, sep, 0);
07493     }
07494     if (pos < 0) {
07495       failed:
07496         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07497     }
07498     if (regex) {
07499         sep = rb_str_subpat(str, sep, INT2FIX(0));
07500         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07501     }
07502     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07503                           sep,
07504                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
07505                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07506 }
07507 
07508 /*
07509  *  call-seq:
07510  *     str.rpartition(sep)             -> [head, sep, tail]
07511  *     str.rpartition(regexp)          -> [head, match, tail]
07512  *
07513  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
07514  *  of the string, and returns the part before it, the match, and the part
07515  *  after it.
07516  *  If it is not found, returns two empty strings and <i>str</i>.
07517  *
07518  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
07519  *     "hello".rpartition("x")         #=> ["", "", "hello"]
07520  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
07521  */
07522 
07523 static VALUE
07524 rb_str_rpartition(VALUE str, VALUE sep)
07525 {
07526     long pos = RSTRING_LEN(str);
07527     int regex = FALSE;
07528 
07529     if (RB_TYPE_P(sep, T_REGEXP)) {
07530         pos = rb_reg_search(sep, str, pos, 1);
07531         regex = TRUE;
07532     }
07533     else {
07534         VALUE tmp;
07535 
07536         tmp = rb_check_string_type(sep);
07537         if (NIL_P(tmp)) {
07538             rb_raise(rb_eTypeError, "type mismatch: %s given",
07539                      rb_obj_classname(sep));
07540         }
07541         sep = tmp;
07542         pos = rb_str_sublen(str, pos);
07543         pos = rb_str_rindex(str, sep, pos);
07544     }
07545     if (pos < 0) {
07546         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07547     }
07548     if (regex) {
07549         sep = rb_reg_nth_match(0, rb_backref_get());
07550     }
07551     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07552                           sep,
07553                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07554 }
07555 
07556 /*
07557  *  call-seq:
07558  *     str.start_with?([prefixes]+)   -> true or false
07559  *
07560  *  Returns true if +str+ starts with one of the +prefixes+ given.
07561  *
07562  *    "hello".start_with?("hell")               #=> true
07563  *
07564  *    # returns true if one of the prefixes matches.
07565  *    "hello".start_with?("heaven", "hell")     #=> true
07566  *    "hello".start_with?("heaven", "paradise") #=> false
07567  */
07568 
07569 static VALUE
07570 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07571 {
07572     int i;
07573 
07574     for (i=0; i<argc; i++) {
07575         VALUE tmp = argv[i];
07576         StringValue(tmp);
07577         rb_enc_check(str, tmp);
07578         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07579         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07580             return Qtrue;
07581     }
07582     return Qfalse;
07583 }
07584 
07585 /*
07586  *  call-seq:
07587  *     str.end_with?([suffixes]+)   -> true or false
07588  *
07589  *  Returns true if +str+ ends with one of the +suffixes+ given.
07590  */
07591 
07592 static VALUE
07593 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07594 {
07595     int i;
07596     char *p, *s, *e;
07597     rb_encoding *enc;
07598 
07599     for (i=0; i<argc; i++) {
07600         VALUE tmp = argv[i];
07601         StringValue(tmp);
07602         enc = rb_enc_check(str, tmp);
07603         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07604         p = RSTRING_PTR(str);
07605         e = p + RSTRING_LEN(str);
07606         s = e - RSTRING_LEN(tmp);
07607         if (rb_enc_left_char_head(p, s, e, enc) != s)
07608             continue;
07609         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07610             return Qtrue;
07611     }
07612     return Qfalse;
07613 }
07614 
07615 void
07616 rb_str_setter(VALUE val, ID id, VALUE *var)
07617 {
07618     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
07619         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07620     }
07621     *var = val;
07622 }
07623 
07624 
07625 /*
07626  *  call-seq:
07627  *     str.force_encoding(encoding)   -> str
07628  *
07629  *  Changes the encoding to +encoding+ and returns self.
07630  */
07631 
07632 static VALUE
07633 rb_str_force_encoding(VALUE str, VALUE enc)
07634 {
07635     str_modifiable(str);
07636     rb_enc_associate(str, rb_to_encoding(enc));
07637     ENC_CODERANGE_CLEAR(str);
07638     return str;
07639 }
07640 
07641 /*
07642  *  call-seq:
07643  *     str.b   -> str
07644  *
07645  *  Returns a copied string whose encoding is ASCII-8BIT.
07646  */
07647 
07648 static VALUE
07649 rb_str_b(VALUE str)
07650 {
07651     VALUE str2 = str_alloc(rb_cString);
07652     str_replace_shared_without_enc(str2, str);
07653     OBJ_INFECT(str2, str);
07654     ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
07655     return str2;
07656 }
07657 
07658 /*
07659  *  call-seq:
07660  *     str.valid_encoding?  -> true or false
07661  *
07662  *  Returns true for a string which is encoded correctly.
07663  *
07664  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07665  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07666  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07667  */
07668 
07669 static VALUE
07670 rb_str_valid_encoding_p(VALUE str)
07671 {
07672     int cr = rb_enc_str_coderange(str);
07673 
07674     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07675 }
07676 
07677 /*
07678  *  call-seq:
07679  *     str.ascii_only?  -> true or false
07680  *
07681  *  Returns true for a string which has only ASCII characters.
07682  *
07683  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07684  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07685  */
07686 
07687 static VALUE
07688 rb_str_is_ascii_only_p(VALUE str)
07689 {
07690     int cr = rb_enc_str_coderange(str);
07691 
07692     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07693 }
07694 
07709 VALUE
07710 rb_str_ellipsize(VALUE str, long len)
07711 {
07712     static const char ellipsis[] = "...";
07713     const long ellipsislen = sizeof(ellipsis) - 1;
07714     rb_encoding *const enc = rb_enc_get(str);
07715     const long blen = RSTRING_LEN(str);
07716     const char *const p = RSTRING_PTR(str), *e = p + blen;
07717     VALUE estr, ret = 0;
07718 
07719     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07720     if (len * rb_enc_mbminlen(enc) >= blen ||
07721         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07722         ret = str;
07723     }
07724     else if (len <= ellipsislen ||
07725              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07726         if (rb_enc_asciicompat(enc)) {
07727             ret = rb_str_new_with_class(str, ellipsis, len);
07728             rb_enc_associate(ret, enc);
07729         }
07730         else {
07731             estr = rb_usascii_str_new(ellipsis, len);
07732             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07733         }
07734     }
07735     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07736         rb_str_cat(ret, ellipsis, ellipsislen);
07737     }
07738     else {
07739         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07740                              rb_enc_from_encoding(enc), 0, Qnil);
07741         rb_str_append(ret, estr);
07742     }
07743     return ret;
07744 }
07745 
07746 /**********************************************************************
07747  * Document-class: Symbol
07748  *
07749  *  <code>Symbol</code> objects represent names and some strings
07750  *  inside the Ruby
07751  *  interpreter. They are generated using the <code>:name</code> and
07752  *  <code>:"string"</code> literals
07753  *  syntax, and by the various <code>to_sym</code> methods. The same
07754  *  <code>Symbol</code> object will be created for a given name or string
07755  *  for the duration of a program's execution, regardless of the context
07756  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07757  *  one context, a method in another, and a class in a third, the
07758  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07759  *  all three contexts.
07760  *
07761  *     module One
07762  *       class Fred
07763  *       end
07764  *       $f1 = :Fred
07765  *     end
07766  *     module Two
07767  *       Fred = 1
07768  *       $f2 = :Fred
07769  *     end
07770  *     def Fred()
07771  *     end
07772  *     $f3 = :Fred
07773  *     $f1.object_id   #=> 2514190
07774  *     $f2.object_id   #=> 2514190
07775  *     $f3.object_id   #=> 2514190
07776  *
07777  */
07778 
07779 
07780 /*
07781  *  call-seq:
07782  *     sym == obj   -> true or false
07783  *
07784  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07785  *  symbol, returns <code>true</code>.
07786  */
07787 
07788 static VALUE
07789 sym_equal(VALUE sym1, VALUE sym2)
07790 {
07791     if (sym1 == sym2) return Qtrue;
07792     return Qfalse;
07793 }
07794 
07795 
07796 static int
07797 sym_printable(const char *s, const char *send, rb_encoding *enc)
07798 {
07799     while (s < send) {
07800         int n;
07801         int c = rb_enc_codepoint_len(s, send, &n, enc);
07802 
07803         if (!rb_enc_isprint(c, enc)) return FALSE;
07804         s += n;
07805     }
07806     return TRUE;
07807 }
07808 
07809 int
07810 rb_str_symname_p(VALUE sym)
07811 {
07812     rb_encoding *enc;
07813     const char *ptr;
07814     long len;
07815     rb_encoding *resenc = rb_default_internal_encoding();
07816 
07817     if (resenc == NULL) resenc = rb_default_external_encoding();
07818     enc = STR_ENC_GET(sym);
07819     ptr = RSTRING_PTR(sym);
07820     len = RSTRING_LEN(sym);
07821     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07822         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07823         return FALSE;
07824     }
07825     return TRUE;
07826 }
07827 
07828 VALUE
07829 rb_str_quote_unprintable(VALUE str)
07830 {
07831     rb_encoding *enc;
07832     const char *ptr;
07833     long len;
07834     rb_encoding *resenc;
07835 
07836     Check_Type(str, T_STRING);
07837     resenc = rb_default_internal_encoding();
07838     if (resenc == NULL) resenc = rb_default_external_encoding();
07839     enc = STR_ENC_GET(str);
07840     ptr = RSTRING_PTR(str);
07841     len = RSTRING_LEN(str);
07842     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
07843         !sym_printable(ptr, ptr + len, enc)) {
07844         return rb_str_inspect(str);
07845     }
07846     return str;
07847 }
07848 
07849 VALUE
07850 rb_id_quote_unprintable(ID id)
07851 {
07852     return rb_str_quote_unprintable(rb_id2str(id));
07853 }
07854 
07855 /*
07856  *  call-seq:
07857  *     sym.inspect    -> string
07858  *
07859  *  Returns the representation of <i>sym</i> as a symbol literal.
07860  *
07861  *     :fred.inspect   #=> ":fred"
07862  */
07863 
07864 static VALUE
07865 sym_inspect(VALUE sym)
07866 {
07867     VALUE str;
07868     const char *ptr;
07869     long len;
07870     ID id = SYM2ID(sym);
07871     char *dest;
07872 
07873     sym = rb_id2str(id);
07874     if (!rb_str_symname_p(sym)) {
07875         str = rb_str_inspect(sym);
07876         len = RSTRING_LEN(str);
07877         rb_str_resize(str, len + 1);
07878         dest = RSTRING_PTR(str);
07879         memmove(dest + 1, dest, len);
07880         dest[0] = ':';
07881     }
07882     else {
07883         rb_encoding *enc = STR_ENC_GET(sym);
07884         ptr = RSTRING_PTR(sym);
07885         len = RSTRING_LEN(sym);
07886         str = rb_enc_str_new(0, len + 1, enc);
07887         dest = RSTRING_PTR(str);
07888         dest[0] = ':';
07889         memcpy(dest + 1, ptr, len);
07890     }
07891     return str;
07892 }
07893 
07894 
07895 /*
07896  *  call-seq:
07897  *     sym.id2name   -> string
07898  *     sym.to_s      -> string
07899  *
07900  *  Returns the name or string corresponding to <i>sym</i>.
07901  *
07902  *     :fred.id2name   #=> "fred"
07903  */
07904 
07905 
07906 VALUE
07907 rb_sym_to_s(VALUE sym)
07908 {
07909     ID id = SYM2ID(sym);
07910 
07911     return str_new3(rb_cString, rb_id2str(id));
07912 }
07913 
07914 
07915 /*
07916  * call-seq:
07917  *   sym.to_sym   -> sym
07918  *   sym.intern   -> sym
07919  *
07920  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07921  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07922  * in this case.
07923  */
07924 
static VALUE
sym_to_sym(VALUE sym)
{
    /* The receiver is already a Symbol, so it is returned unchanged. */
    return sym;
}
07930 
07931 static VALUE
07932 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07933 {
07934     VALUE obj;
07935 
07936     if (argc < 1) {
07937         rb_raise(rb_eArgError, "no receiver given");
07938     }
07939     obj = argv[0];
07940     return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07941 }
07942 
07943 /*
07944  * call-seq:
07945  *   sym.to_proc
07946  *
07947  * Returns a _Proc_ object which respond to the given method by _sym_.
07948  *
07949  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07950  */
07951 
07952 static VALUE
07953 sym_to_proc(VALUE sym)
07954 {
07955     static VALUE sym_proc_cache = Qfalse;
07956     enum {SYM_PROC_CACHE_SIZE = 67};
07957     VALUE proc;
07958     long id, index;
07959     VALUE *aryp;
07960 
07961     if (!sym_proc_cache) {
07962         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07963         rb_gc_register_mark_object(sym_proc_cache);
07964         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07965     }
07966 
07967     id = SYM2ID(sym);
07968     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07969 
07970     aryp = RARRAY_PTR(sym_proc_cache);
07971     if (aryp[index] == sym) {
07972         return aryp[index + 1];
07973     }
07974     else {
07975         proc = rb_proc_new(sym_call, (VALUE)id);
07976         aryp[index] = sym;
07977         aryp[index + 1] = proc;
07978         return proc;
07979     }
07980 }
07981 
07982 /*
07983  * call-seq:
07984  *
07985  *   sym.succ
07986  *
07987  * Same as <code>sym.to_s.succ.intern</code>.
07988  */
07989 
07990 static VALUE
07991 sym_succ(VALUE sym)
07992 {
07993     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07994 }
07995 
07996 /*
07997  * call-seq:
07998  *
07999  *   symbol <=> other_symbol       -> -1, 0, +1 or nil
08000  *
08001  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
08002  * symbols. Returns -1, 0, +1 or nil depending on whether +symbol+ is less
08003  * than, equal to, or greater than +other_symbol+.
08004  *
08005  *  +nil+ is returned if the two values are incomparable.
08006  *
08007  * See String#<=> for more information.
08008  */
08009 
08010 static VALUE
08011 sym_cmp(VALUE sym, VALUE other)
08012 {
08013     if (!SYMBOL_P(other)) {
08014         return Qnil;
08015     }
08016     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
08017 }
08018 
08019 /*
08020  * call-seq:
08021  *
08022  *   sym.casecmp(other)  -> -1, 0, +1 or nil
08023  *
08024  * Case-insensitive version of <code>Symbol#<=></code>.
08025  */
08026 
08027 static VALUE
08028 sym_casecmp(VALUE sym, VALUE other)
08029 {
08030     if (!SYMBOL_P(other)) {
08031         return Qnil;
08032     }
08033     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
08034 }
08035 
08036 /*
08037  * call-seq:
08038  *   sym =~ obj   -> fixnum or nil
08039  *
08040  * Returns <code>sym.to_s =~ obj</code>.
08041  */
08042 
08043 static VALUE
08044 sym_match(VALUE sym, VALUE other)
08045 {
08046     return rb_str_match(rb_sym_to_s(sym), other);
08047 }
08048 
08049 /*
08050  * call-seq:
08051  *   sym[idx]      -> char
08052  *   sym[b, n]     -> char
08053  *
08054  * Returns <code>sym.to_s[]</code>.
08055  */
08056 
08057 static VALUE
08058 sym_aref(int argc, VALUE *argv, VALUE sym)
08059 {
08060     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
08061 }
08062 
08063 /*
08064  * call-seq:
08065  *   sym.length    -> integer
08066  *
08067  * Same as <code>sym.to_s.length</code>.
08068  */
08069 
08070 static VALUE
08071 sym_length(VALUE sym)
08072 {
08073     return rb_str_length(rb_id2str(SYM2ID(sym)));
08074 }
08075 
08076 /*
08077  * call-seq:
08078  *   sym.empty?   -> true or false
08079  *
08080  * Returns that _sym_ is :"" or not.
08081  */
08082 
08083 static VALUE
08084 sym_empty(VALUE sym)
08085 {
08086     return rb_str_empty(rb_id2str(SYM2ID(sym)));
08087 }
08088 
08089 /*
08090  * call-seq:
08091  *   sym.upcase    -> symbol
08092  *
08093  * Same as <code>sym.to_s.upcase.intern</code>.
08094  */
08095 
08096 static VALUE
08097 sym_upcase(VALUE sym)
08098 {
08099     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
08100 }
08101 
08102 /*
08103  * call-seq:
08104  *   sym.downcase  -> symbol
08105  *
08106  * Same as <code>sym.to_s.downcase.intern</code>.
08107  */
08108 
08109 static VALUE
08110 sym_downcase(VALUE sym)
08111 {
08112     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
08113 }
08114 
08115 /*
08116  * call-seq:
08117  *   sym.capitalize  -> symbol
08118  *
08119  * Same as <code>sym.to_s.capitalize.intern</code>.
08120  */
08121 
08122 static VALUE
08123 sym_capitalize(VALUE sym)
08124 {
08125     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
08126 }
08127 
08128 /*
08129  * call-seq:
08130  *   sym.swapcase  -> symbol
08131  *
08132  * Same as <code>sym.to_s.swapcase.intern</code>.
08133  */
08134 
08135 static VALUE
08136 sym_swapcase(VALUE sym)
08137 {
08138     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
08139 }
08140 
08141 /*
08142  * call-seq:
08143  *   sym.encoding   -> encoding
08144  *
08145  * Returns the Encoding object that represents the encoding of _sym_.
08146  */
08147 
08148 static VALUE
08149 sym_encoding(VALUE sym)
08150 {
08151     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
08152 }
08153 
08154 ID
08155 rb_to_id(VALUE name)
08156 {
08157     VALUE tmp;
08158 
08159     switch (TYPE(name)) {
08160       default:
08161         tmp = rb_check_string_type(name);
08162         if (NIL_P(tmp)) {
08163             tmp = rb_inspect(name);
08164             rb_raise(rb_eTypeError, "%s is not a symbol",
08165                      RSTRING_PTR(tmp));
08166         }
08167         name = tmp;
08168         /* fall through */
08169       case T_STRING:
08170         name = rb_str_intern(name);
08171         /* fall through */
08172       case T_SYMBOL:
08173         return SYM2ID(name);
08174     }
08175 
08176     UNREACHABLE;
08177 }
08178 
08179 /*
08180  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
08181  *  bytes, typically representing characters. String objects may be created
08182  *  using <code>String::new</code> or as literals.
08183  *
08184  *  Because of aliasing issues, users of strings should be aware of the methods
08185  *  that modify the contents of a <code>String</code> object.  Typically,
08186  *  methods with names ending in ``!'' modify their receiver, while those
08187  *  without a ``!'' return a new <code>String</code>.  However, there are
08188  *  exceptions, such as <code>String#[]=</code>.
08189  *
08190  */
08191 
void
Init_String(void)
{
    /* Within this function rb_intern() resolves to rb_intern_const(). */
#undef rb_intern
#define rb_intern(str) rb_intern_const(str)

    /* String: subclass of Object that includes Comparable. */
    rb_cString  = rb_define_class("String", rb_cObject);
    rb_include_module(rb_cString, rb_mComparable);
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    /* construction, comparison, indexing, searching and byte access */
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "==", rb_str_equal, 1);
    rb_define_method(rb_cString, "===", rb_str_equal, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "+", rb_str_plus, 1);
    rb_define_method(rb_cString, "*", rb_str_times, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "length", rb_str_length, 0);
    rb_define_method(rb_cString, "size", rb_str_length, 0);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "succ", rb_str_succ, 0);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next", rb_str_succ, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "replace", rb_str_replace, 1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

    /* conversions and textual representations */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);

    /* case conversion returning a new string */
    rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);

    /* case conversion, "!" variants */
    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);

    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat, 1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    /* String#intern and String#to_sym share one implementation */
    rb_define_method(rb_cString, "intern", rb_str_intern, 0);
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, 0);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);

    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);

    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* String#slice shares its implementation with String#[] */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    id_to_s = rb_intern("to_s");

    /* rb_fs starts as nil; both $; and $-F are bound to this one variable */
    rb_fs = Qnil;
    rb_define_variable("$;", &rb_fs);
    rb_define_variable("$-F", &rb_fs);

    /*
     * Symbol: subclass of Object that includes Comparable.  Its allocator
     * and its "new" singleton method are both undefined here.
     */
    rb_cSymbol = rb_define_class("Symbol", rb_cObject);
    rb_include_module(rb_cSymbol, rb_mComparable);
    rb_undef_alloc_func(rb_cSymbol);
    rb_undef_method(CLASS_OF(rb_cSymbol), "new");
    rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */

    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
    rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
    rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    /* Symbol#match is registered with the same function as Symbol#=~ */
    rb_define_method(rb_cSymbol, "match", sym_match, 1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
08367