14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
45#include "ruby_assert.h"
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
61#undef rb_usascii_str_new
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
125#define RUBY_MAX_CHAR_LEN 16
126#define STR_PRECOMPUTED_HASH FL_USER4
127#define STR_SHARED_ROOT FL_USER5
128#define STR_BORROWED FL_USER6
129#define STR_TMPLOCK FL_USER7
130#define STR_NOFREE FL_USER18
131#define STR_FAKESTR FL_USER19
133#define STR_SET_NOEMBED(str) do {\
134 FL_SET((str), STR_NOEMBED);\
135 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
137#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139#define STR_SET_LEN(str, n) do { \
140 RSTRING(str)->len = (n); \
144str_encindex_fastpath(
int encindex)
148 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_US_ASCII:
158str_enc_fastpath(
VALUE str)
163#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
164#define TERM_FILL(ptr, termlen) do {\
165 char *const term_fill_ptr = (ptr);\
166 const int term_fill_len = (termlen);\
167 *term_fill_ptr = '\0';\
168 if (UNLIKELY(term_fill_len > 1))\
169 memset(term_fill_ptr, 0, term_fill_len);\
172#define RESIZE_CAPA(str,capacity) do {\
173 const int termlen = TERM_LEN(str);\
174 RESIZE_CAPA_TERM(str,capacity,termlen);\
176#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
177 if (STR_EMBED_P(str)) {\
178 if (str_embed_capa(str) < capacity + termlen) {\
179 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
180 const long tlen = RSTRING_LEN(str);\
181 memcpy(tmp, RSTRING_PTR(str), tlen);\
182 RSTRING(str)->as.heap.ptr = tmp;\
183 RSTRING(str)->len = tlen;\
184 STR_SET_NOEMBED(str);\
185 RSTRING(str)->as.heap.aux.capa = (capacity);\
189 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
190 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
191 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
192 RSTRING(str)->as.heap.aux.capa = (capacity);\
196#define STR_SET_SHARED(str, shared_str) do { \
197 if (!FL_TEST(str, STR_FAKESTR)) { \
198 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
199 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
200 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
201 FL_SET((str), STR_SHARED); \
202 FL_SET((shared_str), STR_SHARED_ROOT); \
203 if (RBASIC_CLASS((shared_str)) == 0) \
204 FL_SET_RAW((shared_str), STR_BORROWED); \
208#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
209#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
212#define STR_ENC_GET(str) get_encoding(str)
214#if !defined SHARABLE_MIDDLE_SUBSTRING
215# define SHARABLE_MIDDLE_SUBSTRING 0
217#if !SHARABLE_MIDDLE_SUBSTRING
218#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
220#define SHARABLE_SUBSTRING_P(beg, len, end) 1
225str_embed_capa(
VALUE str)
227 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
231rb_str_reembeddable_p(
VALUE str)
233 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
237rb_str_embed_size(
long capa)
243rb_str_size_as_embedded(
VALUE str)
246 if (STR_EMBED_P(str)) {
247 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
251 else if (rb_str_reembeddable_p(str)) {
252 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
255 real_size =
sizeof(
struct RString);
259 real_size +=
sizeof(st_index_t);
266STR_EMBEDDABLE_P(
long len,
long termlen)
268 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
273static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
274static VALUE str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex);
276static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
277static inline void str_modifiable(
VALUE str);
282str_make_independent(
VALUE str)
284 long len = RSTRING_LEN(str);
285 int termlen = TERM_LEN(str);
286 str_make_independent_expand((str),
len, 0L, termlen);
289static inline int str_dependent_p(
VALUE str);
292rb_str_make_independent(
VALUE str)
294 if (str_dependent_p(str)) {
295 str_make_independent(str);
300rb_str_make_embedded(
VALUE str)
305 char *buf =
RSTRING(str)->as.heap.ptr;
309 STR_SET_LEN(str,
len);
312 memcpy(RSTRING_PTR(str), buf,
len);
316 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
320rb_debug_rstring_null_ptr(
const char *func)
322 fprintf(stderr,
"%s is returning NULL!! "
323 "SIGSEGV is highly expected to follow immediately.\n"
324 "If you could reproduce, attach your debugger here, "
325 "and look at the passed string.\n",
330static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
333get_encoding(
VALUE str)
339mustnot_broken(
VALUE str)
341 if (is_broken_string(str)) {
342 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
347mustnot_wchar(
VALUE str)
349 rb_encoding *enc = STR_ENC_GET(str);
350 if (rb_enc_mbminlen(enc) > 1) {
351 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
357static VALUE register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash);
359#if SIZEOF_LONG == SIZEOF_VOIDP
360#define PRECOMPUTED_FAKESTR_HASH 1
364#ifdef PRECOMPUTED_FAKESTR_HASH
366fstring_hash(
VALUE str)
370 return (st_index_t)
RSTRING(str)->as.heap.aux.capa;
377#define fstring_hash rb_str_hash
380const struct st_hash_type rb_fstring_hash_type = {
385#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387static inline st_index_t
388str_do_hash(
VALUE str)
390 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 if (e && !is_ascii_string(str)) {
399str_store_precomputed_hash(
VALUE str, st_index_t hash)
405 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406 size_t free_bytes = str_embed_capa(str) - used_bytes;
410 memcpy(
RSTRING_END(str) + TERM_LEN(str), &hash,
sizeof(hash));
412 FL_SET(str, STR_PRECOMPUTED_HASH);
420 bool force_precompute_hash;
424fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
433 if (rb_objspace_garbage_object_p(str)) {
448 long len = RSTRING_LEN(str);
449 long capa =
len +
sizeof(st_index_t);
450 int term_len = TERM_LEN(str);
452 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(
capa, term_len)) {
454 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str),
len);
455 STR_SET_LEN(new_str, RSTRING_LEN(str));
457 rb_enc_copy(new_str, str);
458 str_store_precomputed_hash(new_str, fstring_hash(str));
462 rb_enc_copy(new_str, str);
463#ifdef PRECOMPUTED_FAKESTR_HASH
464 if (
rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len +
sizeof(st_index_t)) {
465 str_store_precomputed_hash(new_str, (st_index_t)
RSTRING(str)->as.heap.aux.capa);
479 if (!
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
482 if (STR_SHARED_P(str)) {
484 str_make_independent(str);
487 if (!BARE_STRING_P(str)) {
493 RBASIC(str)->flags |= RSTRING_FSTR;
495 *key = *value = arg->fstr = str;
508 if (
FL_TEST(str, RSTRING_FSTR))
511 bare = BARE_STRING_P(str);
513 if (STR_EMBED_P(str)) {
518 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
525 rb_str_resize(str, RSTRING_LEN(str));
527 fstr = register_fstring(str,
false,
false);
530 str_replace_shared_without_enc(str, fstr);
538register_fstring(
VALUE str,
bool copy,
bool force_precompute_hash)
542 .force_precompute_hash = force_precompute_hash
545#if SIZEOF_VOIDP == SIZEOF_LONG
549 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
555 st_table *frozen_strings = rb_vm_fstring_table();
558 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
559 }
while (UNDEF_P(args.fstr));
572setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
587 return (
VALUE)fake_str;
594rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len, rb_encoding *enc)
596 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
605rb_fstring_new(
const char *ptr,
long len)
608 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII),
false,
false);
612rb_fstring_enc_new(
const char *ptr,
long len, rb_encoding *enc)
615 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
false,
false);
619rb_fstring_cstr(
const char *ptr)
621 return rb_fstring_new(ptr, strlen(ptr));
625fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
635 const char *aptr, *bptr;
638 return (alen != blen ||
640 memcmp(aptr, bptr, alen) != 0);
644single_byte_optimizable(
VALUE str)
648 case ENCINDEX_ASCII_8BIT:
649 case ENCINDEX_US_ASCII:
671static inline const char *
672search_nonascii(
const char *p,
const char *e)
674 const uintptr_t *s, *t;
676#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
677# if SIZEOF_UINTPTR_T == 8
678# define NONASCII_MASK UINT64_C(0x8080808080808080)
679# elif SIZEOF_UINTPTR_T == 4
680# define NONASCII_MASK UINT32_C(0x80808080)
682# error "don't know what to do."
685# if SIZEOF_UINTPTR_T == 8
686# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
687# elif SIZEOF_UINTPTR_T == 4
688# define NONASCII_MASK 0x80808080UL
690# error "don't know what to do."
694 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
695#if !UNALIGNED_WORD_ACCESS
696 if ((uintptr_t)p % SIZEOF_VOIDP) {
697 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
702 case 7:
if (p[-7]&0x80)
return p-7;
703 case 6:
if (p[-6]&0x80)
return p-6;
704 case 5:
if (p[-5]&0x80)
return p-5;
705 case 4:
if (p[-4]&0x80)
return p-4;
707 case 3:
if (p[-3]&0x80)
return p-3;
708 case 2:
if (p[-2]&0x80)
return p-2;
709 case 1:
if (p[-1]&0x80)
return p-1;
714#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
715#define aligned_ptr(value) \
716 __builtin_assume_aligned((value), sizeof(uintptr_t))
718#define aligned_ptr(value) (uintptr_t *)(value)
721 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
724 if (*s & NONASCII_MASK) {
725#ifdef WORDS_BIGENDIAN
726 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
728 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
738 case 7:
if (e[-7]&0x80)
return e-7;
739 case 6:
if (e[-6]&0x80)
return e-6;
740 case 5:
if (e[-5]&0x80)
return e-5;
741 case 4:
if (e[-4]&0x80)
return e-4;
743 case 3:
if (e[-3]&0x80)
return e-3;
744 case 2:
if (e[-2]&0x80)
return e-2;
745 case 1:
if (e[-1]&0x80)
return e-1;
751coderange_scan(
const char *p,
long len, rb_encoding *enc)
753 const char *e = p +
len;
755 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
757 p = search_nonascii(p, e);
761 if (rb_enc_asciicompat(enc)) {
762 p = search_nonascii(p, e);
765 int ret = rb_enc_precise_mbclen(p, e, enc);
769 p = search_nonascii(p, e);
775 int ret = rb_enc_precise_mbclen(p, e, enc);
791 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
794 p = search_nonascii(p, e);
798 else if (rb_enc_asciicompat(enc)) {
799 p = search_nonascii(p, e);
805 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p = search_nonascii(p, e);
818 int ret = rb_enc_precise_mbclen(p, e, enc);
843 rb_enc_set_index(str1, rb_enc_get_index(str2));
851rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
856 str_enc_copy(dest, src);
857 if (RSTRING_LEN(dest) == 0) {
858 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
869 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
870 search_nonascii(RSTRING_PTR(dest),
RSTRING_END(dest)))
881rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
883 str_enc_copy(dest, src);
888enc_coderange_scan(
VALUE str, rb_encoding *enc)
890 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
894rb_enc_str_coderange_scan(
VALUE str, rb_encoding *enc)
896 return enc_coderange_scan(str, enc);
905 cr = enc_coderange_scan(str, get_encoding(str));
912rb_enc_str_asciicompat(
VALUE str)
915 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
923 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
932str_mod_check(
VALUE s,
const char *p,
long len)
934 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
940str_capacity(
VALUE str,
const int termlen)
942 if (STR_EMBED_P(str)) {
943 return str_embed_capa(str) - termlen;
945 else if (
FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
949 return RSTRING(str)->as.heap.aux.capa;
956 return str_capacity(str, TERM_LEN(str));
960must_not_null(
const char *ptr)
963 rb_raise(rb_eArgError,
"NULL pointer given");
970 size_t size = rb_str_embed_size(
capa);
974 NEWOBJ_OF(str,
struct RString, klass,
981str_alloc_heap(
VALUE klass)
983 NEWOBJ_OF(str,
struct RString, klass,
990empty_str_alloc(
VALUE klass)
992 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
993 VALUE str = str_alloc_embed(klass, 0);
994 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
1000str_enc_new(
VALUE klass,
const char *ptr,
long len, rb_encoding *enc)
1005 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1009 enc = rb_ascii8bit_encoding();
1012 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1014 int termlen = rb_enc_mbminlen(enc);
1016 if (STR_EMBEDDABLE_P(
len, termlen)) {
1017 str = str_alloc_embed(klass,
len + termlen);
1023 str = str_alloc_heap(klass);
1029 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
1032 rb_enc_raw_set(str, enc);
1035 memcpy(RSTRING_PTR(str), ptr,
len);
1038 STR_SET_LEN(str,
len);
1039 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
1044str_new(
VALUE klass,
const char *ptr,
long len)
1046 return str_enc_new(klass, ptr,
len, rb_ascii8bit_encoding());
1058 return str_enc_new(
rb_cString, ptr,
len, rb_usascii_encoding());
1064 return str_enc_new(
rb_cString, ptr,
len, rb_utf8_encoding());
1068rb_enc_str_new(
const char *ptr,
long len, rb_encoding *enc)
1081 __msan_unpoison_string(ptr);
1101 if (rb_enc_mbminlen(enc) != 1) {
1102 rb_raise(rb_eArgError,
"wchar encoding given");
1104 return rb_enc_str_new(ptr, strlen(ptr), enc);
1108str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex)
1113 rb_raise(rb_eArgError,
"negative string size (or size too big)");
1117 str = str_enc_new(klass, ptr,
len, rb_enc_from_index(encindex));
1120 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1121 str = str_alloc_heap(klass);
1123 RSTRING(str)->as.heap.ptr = (
char *)ptr;
1125 RBASIC(str)->flags |= STR_NOFREE;
1126 rb_enc_associate_index(str, encindex);
1140 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_US_ASCII);
1146 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_UTF_8);
1152 return str_new_static(
rb_cString, ptr,
len, rb_enc_to_index(enc));
1155static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1156 rb_encoding *from, rb_encoding *to,
1157 int ecflags,
VALUE ecopts);
1160is_enc_ascii_string(
VALUE str, rb_encoding *enc)
1162 int encidx = rb_enc_to_index(enc);
1163 if (rb_enc_get_index(str) == encidx)
1164 return is_ascii_string(str);
1175 if (!to)
return str;
1176 if (!from) from = rb_enc_get(str);
1177 if (from == to)
return str;
1178 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1179 rb_is_ascii8bit_enc(to)) {
1180 if (STR_ENC_GET(str) != to) {
1182 rb_enc_associate(str, to);
1189 from, to, ecflags, ecopts);
1190 if (
NIL_P(newstr)) {
1198rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1199 rb_encoding *from,
int ecflags,
VALUE ecopts)
1203 olen = RSTRING_LEN(newstr);
1204 if (ofs < -olen || olen < ofs)
1206 if (ofs < 0) ofs += olen;
1208 STR_SET_LEN(newstr, ofs);
1212 rb_str_modify(newstr);
1213 return str_cat_conv_enc_opts(newstr, ofs, ptr,
len, from,
1219rb_str_initialize(
VALUE str,
const char *ptr,
long len, rb_encoding *enc)
1221 STR_SET_LEN(str, 0);
1222 rb_enc_associate(str, enc);
1228str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1229 rb_encoding *from, rb_encoding *to,
1230 int ecflags,
VALUE ecopts)
1235 VALUE econv_wrapper;
1236 const unsigned char *start, *sp;
1237 unsigned char *dest, *dp;
1238 size_t converted_output = (size_t)ofs;
1243 RBASIC_CLEAR_CLASS(econv_wrapper);
1245 if (!ec)
return Qnil;
1248 sp = (
unsigned char*)ptr;
1250 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1251 (dp = dest + converted_output),
1255 size_t converted_input = sp - start;
1256 size_t rest =
len - converted_input;
1257 converted_output = dp - dest;
1259 if (converted_input && converted_output &&
1260 rest < (LONG_MAX / converted_output)) {
1261 rest = (rest * converted_output) / converted_input;
1266 olen += rest < 2 ? 2 : rest;
1267 rb_str_resize(newstr, olen);
1274 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1276 rb_enc_associate(newstr, to);
1295 const int eidx = rb_enc_to_index(eenc);
1298 return rb_enc_str_new(ptr,
len, eenc);
1302 if ((eidx == rb_ascii8bit_encindex()) ||
1303 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr +
len))) {
1307 ienc = rb_default_internal_encoding();
1308 if (!ienc || eenc == ienc) {
1309 return rb_enc_str_new(ptr,
len, eenc);
1313 if ((eidx == rb_ascii8bit_encindex()) ||
1314 (eidx == rb_usascii_encindex()) ||
1315 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr +
len))) {
1316 return rb_enc_str_new(ptr,
len, ienc);
1319 str = rb_enc_str_new(NULL, 0, ienc);
1322 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr,
len, eenc, 0,
Qnil))) {
1323 rb_str_initialize(str, ptr,
len, eenc);
1329rb_external_str_with_enc(
VALUE str, rb_encoding *eenc)
1331 int eidx = rb_enc_to_index(eenc);
1332 if (eidx == rb_usascii_encindex() &&
1333 !is_ascii_string(str)) {
1334 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1337 rb_enc_associate_index(str, eidx);
1396str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1398 const int termlen = TERM_LEN(str);
1403 if (str_embed_capa(str2) >=
len + termlen) {
1404 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1405 STR_SET_EMBED(str2);
1406 memcpy(ptr2, RSTRING_PTR(str),
len);
1407 TERM_FILL(ptr2+
len, termlen);
1411 if (STR_SHARED_P(str)) {
1412 root =
RSTRING(str)->as.heap.aux.shared;
1421 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1423 rb_fatal(
"about to free a possible shared root");
1425 char *ptr2 = STR_HEAP_PTR(str2);
1427 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1430 FL_SET(str2, STR_NOEMBED);
1431 RSTRING(str2)->as.heap.ptr = ptr;
1432 STR_SET_SHARED(str2, root);
1435 STR_SET_LEN(str2,
len);
1443 str_replace_shared_without_enc(str2, str);
1444 rb_enc_cr_str_exact_copy(str2, str);
1451 return str_replace_shared(str_alloc_heap(klass), str);
1468rb_str_new_frozen_String(
VALUE orig)
1475rb_str_tmp_frozen_acquire(
VALUE orig)
1478 return str_new_frozen_buffer(0, orig, FALSE);
1482rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1484 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1485 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1487 VALUE str = str_alloc_heap(0);
1490 FL_SET(str, STR_SHARED_ROOT);
1492 size_t capa = str_capacity(orig, TERM_LEN(orig));
1498 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1499 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1506 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1507 RBASIC(orig)->flags &= ~STR_NOFREE;
1508 STR_SET_SHARED(orig, str);
1518rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1523 if (STR_EMBED_P(tmp)) {
1532 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1536 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1537 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1542 STR_SET_LEN(tmp, 0);
1550 return str_new_frozen_buffer(klass, orig, TRUE);
1559 VALUE str = str_alloc_heap(klass);
1560 STR_SET_LEN(str, RSTRING_LEN(orig));
1561 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1562 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1563 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1564 RBASIC(orig)->flags &= ~STR_NOFREE;
1565 STR_SET_SHARED(orig, str);
1572str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1576 long len = RSTRING_LEN(orig);
1577 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1578 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1580 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1581 str = str_enc_new(klass, RSTRING_PTR(orig),
len, enc);
1587 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1588 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1594 if ((ofs > 0) || (rest > 0) ||
1597 str = str_new_shared(klass,
shared);
1599 RSTRING(str)->as.heap.ptr += ofs;
1600 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1608 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1609 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1611 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1612 STR_SET_LEN(str, RSTRING_LEN(orig));
1617 str = heap_str_make_shared(klass, orig);
1621 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1633str_new_empty_String(
VALUE str)
1636 rb_enc_copy(v, str);
1640#define STR_BUF_MIN_SIZE 63
1645 if (STR_EMBEDDABLE_P(
capa, 1)) {
1653 RSTRING(str)->as.heap.ptr[0] =
'\0';
1662 long len = strlen(ptr);
1673 return str_new(0, 0,
len);
1679 if (STR_EMBED_P(str)) {
1680 RB_DEBUG_COUNTER_INC(obj_str_embed);
1682 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1683 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1684 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1687 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1688 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1693rb_str_memsize(
VALUE str)
1695 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1696 return STR_HEAP_SIZE(str);
1706 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1709static inline void str_discard(
VALUE str);
1710static void str_shared_replace(
VALUE str,
VALUE str2);
1715 if (str != str2) str_shared_replace(str, str2);
1726 enc = STR_ENC_GET(str2);
1729 termlen = rb_enc_mbminlen(enc);
1731 STR_SET_LEN(str, RSTRING_LEN(str2));
1733 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1735 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1736 rb_enc_associate(str, enc);
1740 if (STR_EMBED_P(str2)) {
1742 long len = RSTRING_LEN(str2);
1745 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1746 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1747 RSTRING(str2)->as.heap.ptr = new_ptr;
1748 STR_SET_LEN(str2,
len);
1750 STR_SET_NOEMBED(str2);
1753 STR_SET_NOEMBED(str);
1755 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1757 if (
FL_TEST(str2, STR_SHARED)) {
1759 STR_SET_SHARED(str,
shared);
1762 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1766 STR_SET_EMBED(str2);
1767 RSTRING_PTR(str2)[0] = 0;
1768 STR_SET_LEN(str2, 0);
1769 rb_enc_associate(str, enc);
1783 return rb_obj_as_string_result(str, obj);
1799 len = RSTRING_LEN(str2);
1800 if (STR_SHARED_P(str2)) {
1803 STR_SET_NOEMBED(str);
1804 STR_SET_LEN(str,
len);
1805 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1806 STR_SET_SHARED(str,
shared);
1807 rb_enc_cr_str_exact_copy(str, str2);
1810 str_replace_shared(str, str2);
1819 size_t size = rb_str_embed_size(
capa);
1823 NEWOBJ_OF(str,
struct RString, klass,
1832 NEWOBJ_OF(str,
struct RString, klass,
1843 encidx = rb_enc_get_index(str);
1847 if (encidx) rb_enc_associate_index(dup, encidx);
1857 long len = RSTRING_LEN(str);
1862 STR_SET_LEN(dup, RSTRING_LEN(str));
1863 return str_duplicate_setup_encoding(str, dup, flags);
1872 root =
RSTRING(str)->as.heap.aux.shared;
1874 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1875 root = str = str_new_frozen(klass, str);
1881 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1882 FL_SET(root, STR_SHARED_ROOT);
1884 flags |= RSTRING_NOEMBED | STR_SHARED;
1886 STR_SET_LEN(dup, RSTRING_LEN(str));
1887 return str_duplicate_setup_encoding(str, dup, flags);
1893 if (STR_EMBED_P(str)) {
1894 return str_duplicate_setup_embed(klass, str, dup);
1897 return str_duplicate_setup_heap(klass, str, dup);
1905 if (STR_EMBED_P(str)) {
1906 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1909 dup = str_alloc_heap(klass);
1912 return str_duplicate_setup(klass, str, dup);
1923rb_str_dup_m(
VALUE str)
1925 if (LIKELY(BARE_STRING_P(str))) {
1936 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1943 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1947 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1948 str_duplicate_setup_embed(klass, str, new_str);
1951 new_str = ec_str_alloc_heap(ec, klass);
1952 str_duplicate_setup_heap(klass, str, new_str);
1961rb_str_with_debug_created_info(
VALUE str,
VALUE path,
int line)
1963 VALUE debug_info = rb_ary_new_from_args(2, path,
INT2FIX(line));
1965 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1982 static ID keyword_ids[2];
1983 VALUE orig, opt, venc, vcapa;
1985 rb_encoding *enc = 0;
1988 if (!keyword_ids[0]) {
1989 keyword_ids[0] = rb_id_encoding();
1990 CONST_ID(keyword_ids[1],
"capacity");
1998 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
1999 enc = rb_to_encoding(venc);
2001 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
2004 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2006 if (
capa < STR_BUF_MIN_SIZE) {
2007 capa = STR_BUF_MIN_SIZE;
2011 len = RSTRING_LEN(orig);
2015 if (orig == str) n = 0;
2017 str_modifiable(str);
2018 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2020 const size_t size = (size_t)
capa + termlen;
2021 const char *
const old_ptr = RSTRING_PTR(str);
2022 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2023 char *new_ptr =
ALLOC_N(
char, size);
2024 if (STR_EMBED_P(str))
RUBY_ASSERT((
long)osize <= str_embed_capa(str));
2025 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2027 RSTRING(str)->as.heap.ptr = new_ptr;
2029 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
2030 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
2031 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
2033 STR_SET_LEN(str,
len);
2036 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
2037 rb_enc_cr_str_exact_copy(str, orig);
2039 FL_SET(str, STR_NOEMBED);
2046 rb_enc_associate(str, enc);
2058rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
2064 static ID keyword_ids[2];
2067 rb_encoding *enc = NULL;
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1],
"capacity");
2077 encoding = kwargs[0];
2078 capacity = kwargs[1];
2087 if (UNDEF_P(encoding)) {
2089 encoding = rb_obj_encoding(orig);
2093 if (!UNDEF_P(encoding)) {
2094 enc = rb_to_encoding(encoding);
2098 if (UNDEF_P(capacity)) {
2100 VALUE empty_str = str_new(klass,
"", 0);
2102 rb_enc_associate(empty_str, enc);
2106 VALUE copy = str_duplicate(klass, orig);
2107 rb_enc_associate(copy, enc);
2120 if (orig_capa >
capa) {
2125 VALUE str = str_enc_new(klass, NULL,
capa, enc);
2126 STR_SET_LEN(str, 0);
2137#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2152static inline uintptr_t
2153count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2158 d = (d>>6) | (~d>>7);
2159 d &= NONASCII_MASK >> 7;
2162#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2164 return rb_popcount_intptr(d);
2168# if SIZEOF_VOIDP == 8
2177enc_strlen(
const char *p,
const char *e, rb_encoding *enc,
int cr)
2183 long diff = (long)(e - p);
2184 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2189 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2190 const uintptr_t *s, *t;
2191 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2192 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2193 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2194 while (p < (
const char *)s) {
2195 if (is_utf8_lead_byte(*p))
len++;
2199 len += count_utf8_lead_bytes_with_word(s);
2202 p = (
const char *)s;
2205 if (is_utf8_lead_byte(*p))
len++;
2211 else if (rb_enc_asciicompat(enc)) {
2216 q = search_nonascii(p, e);
2222 p += rb_enc_fast_mbclen(p, e, enc);
2229 q = search_nonascii(p, e);
2235 p += rb_enc_mbclen(p, e, enc);
2242 for (c=0; p<e; c++) {
2243 p += rb_enc_mbclen(p, e, enc);
2258rb_enc_strlen_cr(
const char *p,
const char *e, rb_encoding *enc,
int *cr)
2266 long diff = (long)(e - p);
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2269 else if (rb_enc_asciicompat(enc)) {
2273 q = search_nonascii(p, e);
2281 ret = rb_enc_precise_mbclen(p, e, enc);
2296 for (c=0; p<e; c++) {
2297 ret = rb_enc_precise_mbclen(p, e, enc);
2304 if (p + rb_enc_mbminlen(enc) <= e)
2305 p += rb_enc_mbminlen(enc);
2316str_strlen(
VALUE str, rb_encoding *enc)
2321 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2322 if (!enc) enc = STR_ENC_GET(str);
2323 p = RSTRING_PTR(str);
2328 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2333 return enc_strlen(p, e, enc, cr);
2340 return str_strlen(str, NULL);
2354 return LONG2NUM(str_strlen(str, NULL));
2366rb_str_bytesize(
VALUE str)
2384rb_str_empty(
VALUE str)
2386 return RBOOL(RSTRING_LEN(str) == 0);
2404 char *ptr1, *ptr2, *ptr3;
2409 enc = rb_enc_check_str(str1, str2);
2412 termlen = rb_enc_mbminlen(enc);
2413 if (len1 > LONG_MAX - len2) {
2414 rb_raise(rb_eArgError,
"string size too big");
2416 str3 = str_enc_new(
rb_cString, 0, len1+len2, enc);
2417 ptr3 = RSTRING_PTR(str3);
2418 memcpy(ptr3, ptr1, len1);
2419 memcpy(ptr3+len1, ptr2, len2);
2420 TERM_FILL(&ptr3[len1+len2], termlen);
2436 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2439 int enc1 = rb_enc_get_index(str1);
2440 int enc2 = rb_enc_get_index(str2);
2445 else if (enc2 < 0) {
2448 else if (enc1 != enc2) {
2451 else if (len1 > LONG_MAX - len2) {
2484 rb_enc_copy(str2, str);
2489 rb_raise(rb_eArgError,
"negative argument");
2491 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2492 if (STR_EMBEDDABLE_P(
len, 1)) {
2494 memset(RSTRING_PTR(str2), 0,
len + 1);
2501 STR_SET_LEN(str2,
len);
2502 rb_enc_copy(str2, str);
2505 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2506 rb_raise(rb_eArgError,
"argument too big");
2509 len *= RSTRING_LEN(str);
2510 termlen = TERM_LEN(str);
2512 ptr2 = RSTRING_PTR(str2);
2514 n = RSTRING_LEN(str);
2515 memcpy(ptr2, RSTRING_PTR(str), n);
2516 while (n <=
len/2) {
2517 memcpy(ptr2 + n, ptr2, n);
2520 memcpy(ptr2 + n, ptr2,
len-n);
2522 STR_SET_LEN(str2,
len);
2523 TERM_FILL(&ptr2[
len], termlen);
2524 rb_enc_cr_str_copy_for_substr(str2, str);
2550 VALUE tmp = rb_check_array_type(arg);
2559rb_check_lockedtmp(
VALUE str)
2561 if (
FL_TEST(str, STR_TMPLOCK)) {
2568#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2570str_modifiable(
VALUE str)
2572 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2573 if (CHILLED_STRING_P(str)) {
2574 CHILLED_STRING_MUTATED(str);
2576 rb_check_lockedtmp(str);
2577 rb_check_frozen(str);
2582str_dependent_p(
VALUE str)
2584 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2594#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2596str_independent(
VALUE str)
2598 if (RB_UNLIKELY(
FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2599 str_modifiable(str);
2600 return !str_dependent_p(str);
2606str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2614 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2615 ptr =
RSTRING(str)->as.heap.ptr;
2619 STR_SET_LEN(str,
len);
2624 oldptr = RSTRING_PTR(str);
2626 memcpy(ptr, oldptr,
len);
2628 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2631 STR_SET_NOEMBED(str);
2632 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2633 TERM_FILL(ptr +
len, termlen);
2634 RSTRING(str)->as.heap.ptr = ptr;
2635 STR_SET_LEN(str,
len);
2642 if (!str_independent(str))
2643 str_make_independent(str);
2650 int termlen = TERM_LEN(str);
2651 long len = RSTRING_LEN(str);
2654 rb_raise(rb_eArgError,
"negative expanding string size");
2656 if (expand >= LONG_MAX -
len) {
2657 rb_raise(rb_eArgError,
"string size too big");
2660 if (!str_independent(str)) {
2661 str_make_independent_expand(str,
len, expand, termlen);
2663 else if (expand > 0) {
2664 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2671str_modify_keep_cr(
VALUE str)
2673 if (!str_independent(str))
2674 str_make_independent(str);
2681str_discard(
VALUE str)
2683 str_modifiable(str);
2684 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2685 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2686 RSTRING(str)->as.heap.ptr = 0;
2687 STR_SET_LEN(str, 0);
2694 int encindex = rb_enc_get_index(str);
2696 if (RB_UNLIKELY(encindex == -1)) {
2700 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2704 rb_encoding *enc = rb_enc_from_index(encindex);
2705 if (!rb_enc_asciicompat(enc)) {
2725 return RSTRING_PTR(str);
2729zero_filled(
const char *s,
int n)
2731 for (; n > 0; --n) {
2738str_null_char(
const char *s,
long len,
const int minlen, rb_encoding *enc)
2740 const char *e = s +
len;
2742 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2743 if (zero_filled(s, minlen))
return s;
2749str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2754 if (str_dependent_p(str)) {
2755 if (!zero_filled(s +
len, termlen))
2756 str_make_independent_expand(str,
len, 0L, termlen);
2759 TERM_FILL(s +
len, termlen);
2762 return RSTRING_PTR(str);
2766rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2768 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2769 long len = RSTRING_LEN(str);
2773 rb_check_lockedtmp(str);
2774 str_make_independent_expand(str,
len, 0L, termlen);
2776 else if (str_dependent_p(str)) {
2777 if (termlen > oldtermlen)
2778 str_make_independent_expand(str,
len, 0L, termlen);
2781 if (!STR_EMBED_P(str)) {
2786 if (termlen > oldtermlen) {
2787 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
2795str_null_check(
VALUE str,
int *w)
2797 char *s = RSTRING_PTR(str);
2798 long len = RSTRING_LEN(str);
2799 rb_encoding *enc = rb_enc_get(str);
2800 const int minlen = rb_enc_mbminlen(enc);
2804 if (str_null_char(s,
len, minlen, enc)) {
2807 return str_fill_term(str, s,
len, minlen);
2810 if (!s || memchr(s, 0,
len)) {
2814 s = str_fill_term(str, s,
len, minlen);
2820rb_str_to_cstr(
VALUE str)
2823 return str_null_check(str, &w);
2831 char *s = str_null_check(str, &w);
2834 rb_raise(rb_eArgError,
"string contains null char");
2836 rb_raise(rb_eArgError,
"string contains null byte");
2842rb_str_fill_terminator(
VALUE str,
const int newminlen)
2844 char *s = RSTRING_PTR(str);
2845 long len = RSTRING_LEN(str);
2846 return str_fill_term(str, s,
len, newminlen);
2852 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2876str_nth_len(
const char *p,
const char *e,
long *nthp, rb_encoding *enc)
2885 else if (rb_enc_asciicompat(enc)) {
2886 const char *p2, *e2;
2889 while (p < e && 0 < nth) {
2896 p2 = search_nonascii(p, e2);
2905 n = rb_enc_mbclen(p, e, enc);
2916 while (p < e && nth--) {
2917 p += rb_enc_mbclen(p, e, enc);
2926rb_enc_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc)
2928 return str_nth_len(p, e, &nth, enc);
2932str_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2937 p = str_nth_len(p, e, &nth, enc);
2946str_offset(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2948 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2949 if (!pp)
return e - p;
2956 return str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
2957 STR_ENC_GET(str), single_byte_optimizable(str));
2962str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2965 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2966 const uintptr_t *s, *t;
2967 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2968 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2969 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2970 while (p < (
const char *)s) {
2971 if (is_utf8_lead_byte(*p)) nth--;
2975 nth -= count_utf8_lead_bytes_with_word(s);
2977 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2981 if (is_utf8_lead_byte(*p)) {
2982 if (nth == 0)
break;
2992str_utf8_offset(
const char *p,
const char *e,
long nth)
2994 const char *pp = str_utf8_nth(p, e, &nth);
3003 if (single_byte_optimizable(str) || pos < 0)
3006 char *p = RSTRING_PTR(str);
3007 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
3012str_subseq(
VALUE str,
long beg,
long len)
3020 const int termlen = TERM_LEN(str);
3021 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
3028 if (str_embed_capa(str2) >=
len + termlen) {
3029 char *ptr2 =
RSTRING(str2)->as.embed.ary;
3030 STR_SET_EMBED(str2);
3031 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
3032 TERM_FILL(ptr2+
len, termlen);
3034 STR_SET_LEN(str2,
len);
3038 str_replace_shared(str2, str);
3041 RSTRING(str2)->as.heap.ptr += beg;
3042 if (RSTRING_LEN(str2) >
len) {
3043 STR_SET_LEN(str2,
len);
3053 VALUE str2 = str_subseq(str, beg,
len);
3054 rb_enc_cr_str_copy_for_substr(str2, str);
3063 const long blen = RSTRING_LEN(str);
3064 rb_encoding *enc = STR_ENC_GET(str);
3065 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3067 if (
len < 0)
return 0;
3068 if (beg < 0 && -beg < 0)
return 0;
3072 if (single_byte_optimizable(str)) {
3073 if (beg > blen)
return 0;
3076 if (beg < 0)
return 0;
3078 if (
len > blen - beg)
3080 if (
len < 0)
return 0;
3085 if (
len > -beg)
len = -beg;
3089 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3092 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3098 slen = str_strlen(str, enc);
3100 if (beg < 0)
return 0;
3102 if (
len == 0)
goto end;
3105 else if (beg > 0 && beg > blen) {
3109 if (beg > str_strlen(str, enc))
return 0;
3114 enc == rb_utf8_encoding()) {
3115 p = str_utf8_nth(s, e, &beg);
3116 if (beg > 0)
return 0;
3117 len = str_utf8_offset(p, e,
len);
3123 p = s + beg * char_sz;
3127 else if (
len * char_sz > e - p)
3132 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3133 if (beg > 0)
return 0;
3137 len = str_offset(p, e,
len, enc, 0);
/*
 * Forward declaration of str_substr(), which is defined further down in
 * this file; it is needed because a caller placed before the definition
 * invokes it with `empty` set to TRUE.
 *
 * In the definition, a NULL position yields Qnil, and a zero `len`
 * combined with a false `empty` also yields Qnil.
 */
3145static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
3150 return str_substr(str, beg,
len, TRUE);
3160str_substr(
VALUE str,
long beg,
long len,
int empty)
3164 if (!p)
return Qnil;
3165 if (!
len && !empty)
return Qnil;
3167 beg = p - RSTRING_PTR(str);
3169 VALUE str2 = str_subseq(str, beg,
len);
3170 rb_enc_cr_str_copy_for_substr(str2, str);
3178 if (CHILLED_STRING_P(str)) {
3183 rb_str_resize(str, RSTRING_LEN(str));
3199 if (
OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3229str_uminus(
VALUE str)
3234 return rb_fstring(str);
/*
 * From this point on, rb_str_dup_frozen is a plain textual alias for
 * rb_str_new_frozen. The name was #undef'd near the top of the file, so
 * this definition does not clash with an earlier macro of the same name.
 */
3238#define rb_str_dup_frozen rb_str_new_frozen
3243 if (
FL_TEST(str, STR_TMPLOCK)) {
3246 FL_SET(str, STR_TMPLOCK);
3253 if (!
FL_TEST(str, STR_TMPLOCK)) {
3271 const int termlen = TERM_LEN(str);
3273 str_modifiable(str);
3274 if (STR_SHARED_P(str)) {
3277 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3278 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3289 else if (
len > RSTRING_LEN(str)) {
3293 const char *
const new_end = RSTRING_PTR(str) +
len;
3294 rb_encoding *enc = rb_enc_get(str);
3303 else if (
len < RSTRING_LEN(str)) {
3311 STR_SET_LEN(str,
len);
3312 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3319 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3322 int independent = str_independent(str);
3323 long slen = RSTRING_LEN(str);
3324 const int termlen = TERM_LEN(str);
3326 if (slen >
len || (termlen != 1 && slen <
len)) {
3332 if (STR_EMBED_P(str)) {
3333 if (
len == slen)
return str;
3334 if (str_embed_capa(str) >=
len + termlen) {
3335 STR_SET_LEN(str,
len);
3339 str_make_independent_expand(str, slen,
len - slen, termlen);
3341 else if (str_embed_capa(str) >=
len + termlen) {
3342 char *ptr = STR_HEAP_PTR(str);
3344 if (slen >
len) slen =
len;
3347 STR_SET_LEN(str,
len);
3348 if (independent) ruby_xfree(ptr);
3351 else if (!independent) {
3352 if (
len == slen)
return str;
3353 str_make_independent_expand(str, slen,
len - slen, termlen);
3357 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3358 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3361 else if (
len == slen)
return str;
3362 STR_SET_LEN(str,
len);
3369str_ensure_available_capa(
VALUE str,
long len)
3371 str_modify_keep_cr(str);
3373 const int termlen = TERM_LEN(str);
3374 long olen = RSTRING_LEN(str);
3376 if (RB_UNLIKELY(olen > LONG_MAX -
len)) {
3377 rb_raise(rb_eArgError,
"string sizes too big");
3380 long total = olen +
len;
3381 long capa = str_capacity(str, termlen);
3384 if (total >= LONG_MAX / 2) {
3387 while (total >
capa) {
3390 RESIZE_CAPA_TERM(str,
capa, termlen);
3395str_buf_cat4(
VALUE str,
const char *ptr,
long len,
bool keep_cr)
3398 str_modify_keep_cr(str);
3403 if (
len == 0)
return 0;
3405 long total, olen,
off = -1;
3407 const int termlen = TERM_LEN(str);
3410 if (ptr >= sptr && ptr <= sptr + olen) {
3414 long capa = str_capacity(str, termlen);
3416 if (olen > LONG_MAX -
len) {
3417 rb_raise(rb_eArgError,
"string sizes too big");
3421 if (total >= LONG_MAX / 2) {
3424 while (total >
capa) {
3427 RESIZE_CAPA_TERM(str,
capa, termlen);
3428 sptr = RSTRING_PTR(str);
3433 memcpy(sptr + olen, ptr,
len);
3434 STR_SET_LEN(str, total);
3435 TERM_FILL(sptr + total, termlen);
/*
 * Shorthands for str_buf_cat4() that always pass `false` for its final
 * `keep_cr` parameter.
 *
 *   str_buf_cat(str, ptr, len)  - append `len` bytes starting at `ptr`.
 *   str_buf_cat2(str, ptr)      - length is taken from rb_strlen_lit(ptr);
 *                                 presumably `ptr` must be a string literal
 *                                 -- NOTE(review): confirm against the
 *                                 definition of rb_strlen_lit.
 *
 * Both are macros, so `ptr` is expanded twice in str_buf_cat2.
 */
3440#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3441#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3446 if (
len == 0)
return str;
3448 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3450 return str_buf_cat(str, ptr,
len);
3461rb_str_buf_cat_byte(
VALUE str,
unsigned char byte)
3466 if (UNLIKELY(!str_independent(str))) {
3467 str_make_independent(str);
3470 long string_length = -1;
3471 const int null_terminator_length = 1;
3476 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3477 rb_raise(rb_eArgError,
"string sizes too big");
3480 long string_capacity = str_capacity(str, null_terminator_length);
3486 if (LIKELY(string_capacity >= string_length + 1)) {
3488 sptr[string_length] = byte;
3489 STR_SET_LEN(str, string_length + 1);
3490 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3494 str_buf_cat(str, (
char *)&
byte, 1);
3510 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3521rb_enc_cr_str_buf_cat(
VALUE str,
const char *ptr,
long len,
3522 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3527 rb_encoding *str_enc, *ptr_enc;
3531 if (str_encindex == ptr_encindex) {
3533 ptr_cr = coderange_scan(ptr,
len, rb_enc_from_index(ptr_encindex));
3537 str_enc = rb_enc_from_index(str_encindex);
3538 ptr_enc = rb_enc_from_index(ptr_encindex);
3539 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3542 if (RSTRING_LEN(str) == 0) {
3545 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3551 ptr_cr = coderange_scan(ptr,
len, ptr_enc);
3560 *ptr_cr_ret = ptr_cr;
3562 if (str_encindex != ptr_encindex &&
3565 str_enc = rb_enc_from_index(str_encindex);
3566 ptr_enc = rb_enc_from_index(ptr_encindex);
3571 res_encindex = str_encindex;
3576 res_encindex = str_encindex;
3580 res_encindex = ptr_encindex;
3585 res_encindex = str_encindex;
3592 res_encindex = str_encindex;
3598 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3600 str_buf_cat(str, ptr,
len);
3606 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3613 return rb_enc_cr_str_buf_cat(str, ptr,
len,
3622 rb_encoding *enc = rb_enc_from_index(encindex);
3623 if (rb_enc_asciicompat(enc)) {
3624 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3630 unsigned int c = (
unsigned char)*ptr;
3631 int len = rb_enc_codelen(c, enc);
3632 rb_enc_mbcput(c, buf, enc);
3633 rb_enc_cr_str_buf_cat(str, buf,
len,
3646 if (str_enc_fastpath(str)) {
3650 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3656 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3667 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3683rb_str_concat_literals(
size_t num,
const VALUE *strary)
3687 unsigned long len = 1;
3692 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3694 str_enc_copy_direct(str, strary[0]);
3696 for (i = s; i < num; ++i) {
3697 const VALUE v = strary[i];
3701 if (encidx != ENCINDEX_US_ASCII) {
3703 rb_enc_set_index(str, encidx);
3728rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3730 str_modifiable(str);
3735 else if (argc > 1) {
3738 rb_enc_copy(arg_str, str);
3739 for (i = 0; i < argc; i++) {
3772rb_str_append_as_bytes(
int argc,
VALUE *argv,
VALUE str)
3774 long needed_capacity = 0;
3778 for (
int index = 0; index < argc; index++) {
3779 VALUE obj = argv[index];
3787 needed_capacity += RSTRING_LEN(obj);
3792 "wrong argument type %"PRIsVALUE
" (expected String or Integer)",
3799 str_ensure_available_capa(str, needed_capacity);
3802 for (
int index = 0; index < argc; index++) {
3803 VALUE obj = argv[index];
3808 argv[index] = obj = rb_int_and(obj,
INT2FIX(0xff));
3809 char byte = (char)(
NUM2INT(obj) & 0xFF);
3818 memcpy(sptr, ptr,
len);
3823 rb_bug(
"append_as_bytes arguments should have been validated");
3827 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3828 TERM_FILL(sptr, TERM_LEN(str));
3833 for (
int index = 0; index < argc; index++) {
3834 VALUE obj = argv[index];
3851 rb_bug(
"append_as_bytes arguments should have been validated");
3921 rb_encoding *enc = STR_ENC_GET(str1);
3925 if (rb_num_to_uint(str2, &code) == 0) {
3938 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3941 rb_str_buf_cat_byte(str1, (
unsigned char)code);
3944 long pos = RSTRING_LEN(str1);
3949 switch (
len = rb_enc_codelen(code, enc)) {
3950 case ONIGERR_INVALID_CODE_POINT_VALUE:
3951 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3953 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3959 rb_enc_mbcput(code, buf, enc);
3960 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3961 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3963 rb_str_resize(str1, pos+
len);
3964 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
3977rb_ascii8bit_appendable_encoding_index(rb_encoding *enc,
unsigned int code)
3979 int encidx = rb_enc_to_index(enc);
3981 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3986 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3987 return ENCINDEX_ASCII_8BIT;
4010rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
4012 str_modifiable(str);
4017 else if (argc > 1) {
4020 rb_enc_copy(arg_str, str);
4021 for (i = 0; i < argc; i++) {
4034 st_index_t precomputed_hash;
4035 memcpy(&precomputed_hash,
RSTRING_END(str) + TERM_LEN(str),
sizeof(precomputed_hash));
4037 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4038 return precomputed_hash;
4041 return str_do_hash(str);
4048 const char *ptr1, *ptr2;
4051 return (len1 != len2 ||
4053 memcmp(ptr1, ptr2, len1) != 0);
4067rb_str_hash_m(
VALUE str)
/*
 * Evaluates to the smaller of `a` and `b` (yields `a` when they are equal).
 * This is a function-like macro: the argument that is selected is evaluated
 * twice, so do not pass expressions with side effects. It is used below to
 * bound a memcmp() to the shorter of two string lengths.
 */
4073#define lesser(a,b) (((a)>(b))?(b):(a))
4081 if (RSTRING_LEN(str1) == 0)
return TRUE;
4082 if (RSTRING_LEN(str2) == 0)
return TRUE;
4085 if (idx1 == idx2)
return TRUE;
4090 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4094 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4104 const char *ptr1, *ptr2;
4107 if (str1 == str2)
return 0;
4110 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4119 if (len1 > len2)
return 1;
4122 if (retval > 0)
return 1;
4149 if (str1 == str2)
return Qtrue;
4156 return rb_str_eql_internal(str1, str2);
4180 if (str1 == str2)
return Qtrue;
4182 return rb_str_eql_internal(str1, str2);
4213 return rb_invcmp(str1, str2);
4255 return str_casecmp(str1, s);
4263 const char *p1, *p1end, *p2, *p2end;
4265 enc = rb_enc_compatible(str1, str2);
4270 p1 = RSTRING_PTR(str1); p1end =
RSTRING_END(str1);
4271 p2 = RSTRING_PTR(str2); p2end =
RSTRING_END(str2);
4272 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4273 while (p1 < p1end && p2 < p2end) {
4275 unsigned int c1 =
TOLOWER(*p1 & 0xff);
4276 unsigned int c2 =
TOLOWER(*p2 & 0xff);
4278 return INT2FIX(c1 < c2 ? -1 : 1);
4285 while (p1 < p1end && p2 < p2end) {
4286 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4287 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4289 if (0 <= c1 && 0 <= c2) {
4293 return INT2FIX(c1 < c2 ? -1 : 1);
4297 l1 = rb_enc_mbclen(p1, p1end, enc);
4298 l2 = rb_enc_mbclen(p2, p2end, enc);
4299 len = l1 < l2 ? l1 : l2;
4300 r = memcmp(p1, p2,
len);
4302 return INT2FIX(r < 0 ? -1 : 1);
4304 return INT2FIX(l1 < l2 ? -1 : 1);
4310 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
4311 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
4345 return str_casecmp_p(str1, s);
4352 VALUE folded_str1, folded_str2;
4353 VALUE fold_opt = sym_fold;
4355 enc = rb_enc_compatible(str1, str2);
4360 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4361 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4363 return rb_str_eql(folded_str1, folded_str2);
4367strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
4368 const char *sub_ptr,
long sub_len,
long offset, rb_encoding *enc)
4370 const char *search_start = str_ptr;
4371 long pos, search_len = str_len - offset;
4375 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4376 if (pos < 0)
return pos;
4378 if (t == search_start + pos)
break;
4379 search_len -= t - search_start;
4380 if (search_len <= 0)
return -1;
4381 offset += t - search_start;
4384 return pos + offset;
/*
 * Shorthands for rb_strseq_index(), differing only in its `in_byte`
 * argument:
 *
 *   rb_str_index(str, sub, offset)      - in_byte = 0
 *   rb_str_byteindex(str, sub, offset)  - in_byte = 1
 *
 * In rb_strseq_index(), when `in_byte` is zero the offset is passed
 * through str_offset() before searching; when it is non-zero the string
 * and substring lengths are taken directly from RSTRING_LEN().
 */
4388#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4389#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4392rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
4394 const char *str_ptr, *str_ptr_end, *sub_ptr;
4395 long str_len, sub_len;
4398 enc = rb_enc_check(str, sub);
4399 if (is_broken_string(sub))
return -1;
4401 str_ptr = RSTRING_PTR(str);
4403 str_len = RSTRING_LEN(str);
4404 sub_ptr = RSTRING_PTR(sub);
4405 sub_len = RSTRING_LEN(sub);
4407 if (str_len < sub_len)
return -1;
4410 long str_len_char, sub_len_char;
4411 int single_byte = single_byte_optimizable(str);
4412 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4413 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4415 offset += str_len_char;
4416 if (offset < 0)
return -1;
4418 if (str_len_char - offset < sub_len_char)
return -1;
4419 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4422 if (sub_len == 0)
return offset;
4425 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4439rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4443 rb_encoding *enc = STR_ENC_GET(str);
4446 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4447 long slen = str_strlen(str, enc);
4449 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4461 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4462 enc, single_byte_optimizable(str));
4473 pos = rb_str_index(str, sub, pos);
4487str_ensure_byte_pos(
VALUE str,
long pos)
4489 if (!single_byte_optimizable(str)) {
4490 const char *s = RSTRING_PTR(str);
4492 const char *p = s + pos;
4493 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4495 "offset %ld does not land on character boundary", pos);
4542rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4548 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4549 long slen = RSTRING_LEN(str);
4551 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4562 str_ensure_byte_pos(str, pos);
4574 pos = rb_str_byteindex(str, sub, pos);
4575 if (pos >= 0)
return LONG2NUM(pos);
4582memrchr(
const char *search_str,
int chr,
long search_len)
4584 const char *ptr = search_str + search_len;
4585 while (ptr > search_str) {
4586 if ((
unsigned char)*(--ptr) == chr)
return (
void *)ptr;
4594str_rindex(
VALUE str,
VALUE sub,
const char *s, rb_encoding *enc)
4596 char *hit, *adjusted;
4598 long slen, searchlen;
4601 sbeg = RSTRING_PTR(str);
4602 slen = RSTRING_LEN(sub);
4603 if (slen == 0)
return s - sbeg;
4605 t = RSTRING_PTR(sub);
4607 searchlen = s - sbeg + 1;
4609 if (memcmp(s, t, slen) == 0) {
4614 hit = memrchr(sbeg, c, searchlen);
4617 if (hit != adjusted) {
4618 searchlen = adjusted - sbeg;
4621 if (memcmp(hit, t, slen) == 0)
4623 searchlen = adjusted - sbeg;
4624 }
while (searchlen > 0);
4638 enc = rb_enc_check(str, sub);
4639 if (is_broken_string(sub))
return -1;
4640 singlebyte = single_byte_optimizable(str);
4641 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4642 slen = str_strlen(sub, enc);
4645 if (
len < slen)
return -1;
4646 if (
len - pos < slen) pos =
len - slen;
4647 if (
len == 0)
return pos;
4649 sbeg = RSTRING_PTR(str);
4652 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4658 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4659 return str_rindex(str, sub, s, enc);
4720rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4724 rb_encoding *enc = STR_ENC_GET(str);
4725 long pos,
len = str_strlen(str, enc);
4727 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4729 if (pos < 0 && (pos +=
len) < 0) {
4735 if (pos >
len) pos =
len;
4743 pos = str_offset(RSTRING_PTR(str),
RSTRING_END(str), pos,
4744 enc, single_byte_optimizable(str));
4755 pos = rb_str_rindex(str, sub, pos);
4765rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4771 enc = rb_enc_check(str, sub);
4772 if (is_broken_string(sub))
return -1;
4773 len = RSTRING_LEN(str);
4774 slen = RSTRING_LEN(sub);
4777 if (
len < slen)
return -1;
4778 if (
len - pos < slen) pos =
len - slen;
4779 if (
len == 0)
return pos;
4781 sbeg = RSTRING_PTR(str);
4784 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4791 return str_rindex(str, sub, s, enc);
4856rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4860 long pos,
len = RSTRING_LEN(str);
4862 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4864 if (pos < 0 && (pos +=
len) < 0) {
4870 if (pos >
len) pos =
len;
4876 str_ensure_byte_pos(str, pos);
4888 pos = rb_str_byterindex(str, sub, pos);
4889 if (pos >= 0)
return LONG2NUM(pos);
4925 switch (OBJ_BUILTIN_TYPE(y)) {
4977rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4984 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
5016rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
5020 re = get_pat(argv[0]);
5021 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
5030static enum neighbor_char
5031enc_succ_char(
char *p,
long len, rb_encoding *enc)
5036 if (rb_enc_mbminlen(enc) > 1) {
5038 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5040 return NEIGHBOR_NOT_CHAR;
5042 c = rb_enc_mbc_to_codepoint(p, p +
len, enc) + 1;
5044 if (!l)
return NEIGHBOR_NOT_CHAR;
5045 if (l !=
len)
return NEIGHBOR_WRAPPED;
5046 rb_enc_mbcput(c, p, enc);
5047 r = rb_enc_precise_mbclen(p, p +
len, enc);
5049 return NEIGHBOR_NOT_CHAR;
5051 return NEIGHBOR_FOUND;
5054 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
5057 return NEIGHBOR_WRAPPED;
5058 ++((
unsigned char*)p)[i];
5059 l = rb_enc_precise_mbclen(p, p+
len, enc);
5063 return NEIGHBOR_FOUND;
5066 memset(p+l, 0xff,
len-l);
5072 for (len2 =
len-1; 0 < len2; len2--) {
5073 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5077 memset(p+len2+1, 0xff,
len-(len2+1));
5082static enum neighbor_char
5083enc_pred_char(
char *p,
long len, rb_encoding *enc)
5087 if (rb_enc_mbminlen(enc) > 1) {
5089 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
5091 return NEIGHBOR_NOT_CHAR;
5093 c = rb_enc_mbc_to_codepoint(p, p +
len, enc);
5094 if (!c)
return NEIGHBOR_NOT_CHAR;
5097 if (!l)
return NEIGHBOR_NOT_CHAR;
5098 if (l !=
len)
return NEIGHBOR_WRAPPED;
5099 rb_enc_mbcput(c, p, enc);
5100 r = rb_enc_precise_mbclen(p, p +
len, enc);
5102 return NEIGHBOR_NOT_CHAR;
5104 return NEIGHBOR_FOUND;
5107 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
5110 return NEIGHBOR_WRAPPED;
5111 --((
unsigned char*)p)[i];
5112 l = rb_enc_precise_mbclen(p, p+
len, enc);
5116 return NEIGHBOR_FOUND;
5119 memset(p+l, 0,
len-l);
5125 for (len2 =
len-1; 0 < len2; len2--) {
5126 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5130 memset(p+len2+1, 0,
len-(len2+1));
5144static enum neighbor_char
5145enc_succ_alnum_char(
char *p,
long len, rb_encoding *enc,
char *carry)
5147 enum neighbor_char ret;
5151 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5155 const int max_gaps = 1;
5157 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5159 ctype = ONIGENC_CTYPE_DIGIT;
5161 ctype = ONIGENC_CTYPE_ALPHA;
5163 return NEIGHBOR_NOT_CHAR;
5166 for (
try = 0;
try <= max_gaps; ++
try) {
5167 ret = enc_succ_char(p,
len, enc);
5168 if (ret == NEIGHBOR_FOUND) {
5169 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5171 return NEIGHBOR_FOUND;
5178 ret = enc_pred_char(p,
len, enc);
5179 if (ret == NEIGHBOR_FOUND) {
5180 c = rb_enc_mbc_to_codepoint(p, p+
len, enc);
5193 return NEIGHBOR_NOT_CHAR;
5196 if (ctype != ONIGENC_CTYPE_DIGIT) {
5198 return NEIGHBOR_WRAPPED;
5202 enc_succ_char(carry,
len, enc);
5203 return NEIGHBOR_WRAPPED;
5271 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5272 rb_enc_cr_str_copy_for_substr(str, orig);
5273 return str_succ(str);
5280 char *sbeg, *s, *e, *last_alnum = 0;
5281 int found_alnum = 0;
5283 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
5284 long carry_pos = 0, carry_len = 1;
5285 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5287 slen = RSTRING_LEN(str);
5288 if (slen == 0)
return str;
5290 enc = STR_ENC_GET(str);
5291 sbeg = RSTRING_PTR(str);
5292 s = e = sbeg + slen;
5294 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5295 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5301 l = rb_enc_precise_mbclen(s, e, enc);
5302 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5303 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5304 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5306 case NEIGHBOR_NOT_CHAR:
5308 case NEIGHBOR_FOUND:
5310 case NEIGHBOR_WRAPPED:
5315 carry_pos = s - sbeg;
5320 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5321 enum neighbor_char neighbor;
5322 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5323 l = rb_enc_precise_mbclen(s, e, enc);
5324 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
5325 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5327 neighbor = enc_succ_char(tmp, l, enc);
5329 case NEIGHBOR_FOUND:
5333 case NEIGHBOR_WRAPPED:
5336 case NEIGHBOR_NOT_CHAR:
5339 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5341 enc_succ_char(s, l, enc);
5343 if (!rb_enc_asciicompat(enc)) {
5344 MEMCPY(carry, s,
char, l);
5347 carry_pos = s - sbeg;
5351 RESIZE_CAPA(str, slen + carry_len);
5352 sbeg = RSTRING_PTR(str);
5353 s = sbeg + carry_pos;
5354 memmove(s + carry_len, s, slen - carry_pos);
5355 memmove(s, carry, carry_len);
5357 STR_SET_LEN(str, slen);
5358 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5372rb_str_succ_bang(
VALUE str)
5380all_digits_p(
const char *s,
long len)
5434 VALUE end, exclusive;
5438 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5444 VALUE current, after_end;
5451 enc = rb_enc_check(beg, end);
5452 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5454 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5455 char c = RSTRING_PTR(beg)[0];
5456 char e = RSTRING_PTR(end)[0];
5458 if (c > e || (excl && c == e))
return beg;
5460 VALUE str = rb_enc_str_new(&c, 1, enc);
5462 if ((*each)(str, arg))
break;
5463 if (!excl && c == e)
break;
5465 if (excl && c == e)
break;
5470 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5471 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5472 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5477 b = rb_str_to_inum(beg, 10, FALSE);
5478 e = rb_str_to_inum(end, 10, FALSE);
5482 rb_encoding *usascii = rb_usascii_encoding();
5485 if (excl && bi == ei)
break;
5486 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5491 ID op = excl ?
'<' : idLE;
5492 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5497 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5498 b = rb_funcallv(b, succ, 0, 0);
5505 if (n > 0 || (excl && n == 0))
return beg;
5507 after_end = rb_funcallv(end, succ, 0, 0);
5512 next = rb_funcallv(current, succ, 0, 0);
5513 if ((*each)(current, arg))
break;
5514 if (
NIL_P(next))
break;
5518 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5533 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5534 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5535 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5537 b = rb_str_to_inum(beg, 10, FALSE);
5540 rb_encoding *usascii = rb_usascii_encoding();
5543 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5551 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5552 b = rb_funcallv(b, succ, 0, 0);
5558 VALUE next = rb_funcallv(current, succ, 0, 0);
5559 if ((*each)(current, arg))
break;
5562 if (RSTRING_LEN(current) == 0)
5573 if (!
rb_equal(str, *argp))
return 0;
5587 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5588 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5589 rb_enc_asciicompat(STR_ENC_GET(val))) {
5590 const char *bp = RSTRING_PTR(beg);
5591 const char *ep = RSTRING_PTR(end);
5592 const char *vp = RSTRING_PTR(val);
5593 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5594 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5602 if (b <= v && v < e)
return Qtrue;
5603 return RBOOL(!
RTEST(exclusive) && v == e);
5610 all_digits_p(bp, RSTRING_LEN(beg)) &&
5611 all_digits_p(ep, RSTRING_LEN(end))) {
5616 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5618 return RBOOL(
NIL_P(val));
5641 return rb_str_subpat(str, indx,
INT2FIX(0));
5644 if (rb_str_index(str, indx, 0) != -1)
5650 long beg,
len = str_strlen(str, NULL);
5662 return str_substr(str, idx, 1, FALSE);
5681rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5685 return rb_str_subpat(str, argv[0], argv[1]);
5688 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5692 return rb_str_aref(str, argv[0]);
5698 char *ptr = RSTRING_PTR(str);
5699 long olen = RSTRING_LEN(str), nlen;
5701 str_modifiable(str);
5702 if (
len > olen)
len = olen;
5704 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5706 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5708 ptr =
RSTRING(str)->as.embed.ary;
5709 memmove(ptr, oldptr +
len, nlen);
5710 if (fl == STR_NOEMBED)
xfree(oldptr);
5713 if (!STR_SHARED_P(str)) {
5715 rb_enc_cr_str_exact_copy(shared, str);
5720 STR_SET_LEN(str, nlen);
5722 if (!SHARABLE_MIDDLE_SUBSTRING) {
5723 TERM_FILL(ptr + nlen, TERM_LEN(str));
5730rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5736 if (beg == 0 && vlen == 0) {
5741 str_modify_keep_cr(str);
5745 RESIZE_CAPA(str, slen + vlen -
len);
5746 sptr = RSTRING_PTR(str);
5755 memmove(sptr + beg + vlen,
5757 slen - (beg +
len));
5759 if (vlen < beg &&
len < 0) {
5763 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5766 STR_SET_LEN(str, slen);
5767 TERM_FILL(&sptr[slen], TERM_LEN(str));
5774 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5783 int singlebyte = single_byte_optimizable(str);
5789 enc = rb_enc_check(str, val);
5790 slen = str_strlen(str, enc);
5792 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5801 if (
len > slen - beg) {
5804 p = str_nth(RSTRING_PTR(str),
RSTRING_END(str), beg, enc, singlebyte);
5809 beg = p - RSTRING_PTR(str);
5811 rb_str_update_0(str, beg,
len, val);
5812 rb_enc_associate(str, enc);
5823 long start, end,
len;
5833 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5837 nth += regs->num_regs;
5847 enc = rb_enc_check_str(str, val);
5848 rb_str_update_0(str, start,
len, val);
5849 rb_enc_associate(str, enc);
5857 switch (
TYPE(indx)) {
5859 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5863 beg = rb_str_index(str, indx, 0);
/*
 * rb_str_aset_m: argc/argv entry point that forwards to one of two helpers.
 *
 * Visible paths:
 *  - rb_str_subpat_set(str, argv[0], argv[1], argv[2]) for a three-value form;
 *  - return rb_str_aset(str, argv[0], argv[1]) for a two-value form.
 * The conditions that select between them are not present in this excerpt
 * (presumably a test on argc and on the type of argv[0] -- TODO confirm).
 */
5917rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5921 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5929 return rb_str_aset(str, argv[0], argv[1]);
/*
 * rb_str_slice_bang: remove a portion of str in place.  Only fragments of
 * the body are present in this excerpt.
 *
 * Visible behaviour:
 *  - str_modify_keep_cr() is called on str first;
 *  - for a regexp group index nth: a negative nth is offset by
 *    regs->num_regs and Qnil is returned when the result is <= 0; an nth
 *    >= regs->num_regs also returns Qnil; len is END(nth) - beg;
 *  - an argc == 2 branch and several other branches compute beg as a byte
 *    offset (p - RSTRING_PTR(str));
 *  - for a string index, rb_str_index() locates it; -1 returns Qnil and
 *    len becomes RSTRING_LEN(indx);
 *  - rb_enc_cr_str_copy_for_substr(result, str) is applied to the result;
 *  - finally the receiver is compacted: the tail after beg + len is moved
 *    (the move call itself is only partly visible), the length is stored
 *    and the terminator is rewritten.
 */
5989rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
5997 str_modify_keep_cr(str);
6005 if ((nth += regs->num_regs) <= 0)
return Qnil;
6007 else if (nth >= regs->num_regs)
return Qnil;
6009 len = END(nth) - beg;
6012 else if (argc == 2) {
6021 beg = p - RSTRING_PTR(str);
6025 beg = rb_str_index(str, indx, 0);
6026 if (beg == -1)
return Qnil;
6027 len = RSTRING_LEN(indx);
6039 beg = p - RSTRING_PTR(str);
6048 beg = p - RSTRING_PTR(str);
6052 rb_enc_cr_str_copy_for_substr(result, str);
6060 char *sptr = RSTRING_PTR(str);
6061 long slen = RSTRING_LEN(str);
6062 if (beg +
len > slen)
6066 slen - (beg +
len));
6068 STR_SET_LEN(str, slen);
6069 TERM_FILL(&sptr[slen], TERM_LEN(str));
6080 switch (OBJ_BUILTIN_TYPE(pat)) {
/*
 * get_pat_quoted: normalise a pattern argument.
 *
 * pat   - the pattern value, dispatched on OBJ_BUILTIN_TYPE(pat); the
 *         individual cases are not present in this excerpt.
 * check - when nonzero and is_broken_string(pat) holds, the exception
 *         object produced by rb_reg_check_preprocess(pat) is raised.
 */
6099get_pat_quoted(
VALUE pat,
int check)
6103 switch (OBJ_BUILTIN_TYPE(pat)) {
6117 if (check && is_broken_string(pat)) {
6118 rb_exc_raise(rb_reg_check_preprocess(pat));
/*
 * rb_pat_search: search str for pat starting at byte offset pos.
 *
 * Two search paths are visible:
 *  - a byte-index search through rb_str_byteindex(); when set_backref_str
 *    is nonzero a frozen copy of str is taken and rb_backref_set_string()
 *    records a match of RSTRING_LEN(pat) bytes at pos;
 *  - a regexp search that returns rb_reg_search0(pat, str, pos, 0,
 *    set_backref_str).
 * The test that chooses between them is not present in this excerpt
 * (presumably whether pat is a String -- TODO confirm).
 */
6124rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
6127 pos = rb_str_byteindex(str, pat, pos);
6128 if (set_backref_str) {
6130 str = rb_str_new_frozen_String(str);
6131 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6140 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
/*
 * rb_str_sub_bang: replace the first match of a pattern in str, in place.
 * Only fragments of the body are present in this excerpt.
 *
 * Visible steps:
 *  - argv[1] is probed with rb_check_hash_type(); argv[0] is normalised by
 *    get_pat_quoted(argv[0], 1);
 *  - str_modifiable(str) is checked, then rb_pat_search() is run from
 *    offset 0 with backref recording enabled;
 *  - for a plain string pattern the match end is beg0 + RSTRING_LEN(pat);
 *  - in block or hash mode the buffer pointer and length are saved, the
 *    replacement is obtained from rb_yield(match0) or from a hash lookup
 *    on the matched substring and converted with rb_obj_as_string(); then
 *    str_mod_check() and rb_check_frozen() are run on str, using the saved
 *    pointer and length;
 *  - encodings are reconciled with rb_enc_compatible(); a fallback that
 *    reports both encoding names is partly visible;
 *  - the buffer is resized to len + rlen - plen, the tail after the match
 *    is moved to beg0 + rlen, rlen bytes of repl are copied to beg0, and
 *    the length and terminator are updated.
 */
6160rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
6174 hash = rb_check_hash_type(argv[1]);
6180 pat = get_pat_quoted(argv[0], 1);
6182 str_modifiable(str);
6183 beg = rb_pat_search(pat, str, 0, 1);
6197 end0 = beg0 + RSTRING_LEN(pat);
6206 if (iter || !
NIL_P(hash)) {
6207 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6210 repl = rb_obj_as_string(
rb_yield(match0));
6213 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6214 repl = rb_obj_as_string(repl);
6216 str_mod_check(str, p,
len);
6217 rb_check_frozen(str);
6223 enc = rb_enc_compatible(str, repl);
6225 rb_encoding *str_enc = STR_ENC_GET(str);
6226 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
6230 rb_enc_inspect_name(str_enc),
6231 rb_enc_inspect_name(STR_ENC_GET(repl)));
6233 enc = STR_ENC_GET(repl);
6236 rb_enc_associate(str, enc);
6246 rlen = RSTRING_LEN(repl);
6247 len = RSTRING_LEN(str);
6249 RESIZE_CAPA(str,
len + rlen - plen);
6251 p = RSTRING_PTR(str);
6253 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
6255 rp = RSTRING_PTR(repl);
6256 memmove(p + beg0, rp, rlen);
6258 STR_SET_LEN(str,
len);
6259 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
6288 rb_str_sub_bang(argc, argv, str);
/*
 * str_gsub: shared implementation behind the global-substitution methods.
 * bang is nonzero for the in-place variant.  Only fragments of the body are
 * present in this excerpt.
 *
 * Visible behaviour:
 *  - mode is one of STR, ITER or MAP and starts as STR; need_backref starts
 *    at -1, meaning not yet decided;
 *  - argv[1] is probed with rb_check_hash_type(); an argument count outside
 *    1..2 reaches rb_error_arity();
 *  - the pattern is normalised with get_pat_quoted() and searched from
 *    offset 0; on the no-match path the bang variant returns Qnil;
 *  - the initial output size estimate is RSTRING_LEN(str) + 30, and the
 *    destination is associated with the encoding of str;
 *  - per match, the replacement comes from rb_yield(match0) or from a hash
 *    lookup on the matched substring, followed by str_mod_check() against
 *    the saved pointer and length;
 *  - while need_backref is still negative it is set to (val != repl);
 *  - the scan stops once end0 reaches the end of str; otherwise it steps
 *    past one character (rb_enc_fast_mbclen) before searching again from
 *    the new offset;
 *  - after the loop rb_pat_search() is called once more at last with
 *    backref recording on, and str_shared_replace(str, dest) installs the
 *    result (presumably only on the bang path -- TODO confirm).
 */
6293str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
6296 long beg, beg0, end0;
6297 long offset, blen, slen,
len, last;
6298 enum {STR, ITER, MAP} mode = STR;
6300 int need_backref = -1;
6301 rb_encoding *str_enc;
6310 hash = rb_check_hash_type(argv[1]);
6319 rb_error_arity(argc, 1, 2);
6322 pat = get_pat_quoted(argv[0], 1);
6323 beg = rb_pat_search(pat, str, 0, need_backref);
6325 if (bang)
return Qnil;
6330 blen = RSTRING_LEN(str) + 30;
6332 sp = RSTRING_PTR(str);
6333 slen = RSTRING_LEN(str);
6335 str_enc = STR_ENC_GET(str);
6336 rb_enc_associate(dest, str_enc);
6344 end0 = beg0 + RSTRING_LEN(pat);
6355 val = rb_obj_as_string(
rb_yield(match0));
6358 val = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
6359 val = rb_obj_as_string(val);
6361 str_mod_check(str, sp, slen);
6366 else if (need_backref) {
6368 if (need_backref < 0) {
6369 need_backref = val != repl;
6376 len = beg0 - offset;
6390 if (RSTRING_LEN(str) <= end0)
break;
6391 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0,
RSTRING_END(str), str_enc);
6393 offset = end0 +
len;
6395 cp = RSTRING_PTR(str) + offset;
6396 if (offset > RSTRING_LEN(str))
break;
6397 beg = rb_pat_search(pat, str, offset, need_backref);
6401 if (RSTRING_LEN(str) > offset) {
6404 rb_pat_search(pat, str, last, 1);
6406 str_shared_replace(str, dest);
/*
 * rb_str_gsub_bang: in-place global substitution.
 * Calls str_modify_keep_cr() on str, then returns the result of str_gsub()
 * invoked with its bang argument set to 1.
 */
6434rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6436 str_modify_keep_cr(str);
6437 return str_gsub(argc, argv, str, 1);
6460 return str_gsub(argc, argv, str, 0);
6478 str_modifiable(str);
6479 if (str == str2)
return str;
6483 return str_replace(str, str2);
/*
 * rb_str_clear: empty str in place.
 * Visible steps: the length is set to 0, a zero byte is written at the
 * start of the buffer, and a branch is taken on whether the encoding of str
 * is ASCII compatible (the branch bodies are not present in this excerpt).
 */
6498rb_str_clear(
VALUE str)
6502 STR_SET_LEN(str, 0);
6503 RSTRING_PTR(str)[0] = 0;
6504 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6523rb_str_chr(
VALUE str)
6547 pos += RSTRING_LEN(str);
6548 if (pos < 0 || RSTRING_LEN(str) <= pos)
6551 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6570 long len = RSTRING_LEN(str);
6571 char *ptr, *head, *left = 0;
6575 if (pos < -
len ||
len <= pos)
6582 char byte = (char)(
NUM2INT(w) & 0xFF);
6584 if (!str_independent(str))
6585 str_make_independent(str);
6586 enc = STR_ENC_GET(str);
6587 head = RSTRING_PTR(str);
6589 if (!STR_EMBED_P(str)) {
6596 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6604 width = rb_enc_precise_mbclen(left, head+
len, enc);
6606 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
/*
 * str_byte_substr: return the byte range (beg, len) of str as a new string,
 * or Qnil when the range is not valid.
 *
 * Visible checks:
 *  - beg past the byte length n, or a negative len, returns Qnil;
 *  - a second beg < 0 test returns Qnil (presumably after a negative beg
 *    has been offset by n; that adjustment is not visible here);
 *  - a further condition returns Qnil unless empty is nonzero.
 * The result is built with str_subseq() and receives the encoding of str via
 * str_enc_copy_direct(); an empty result on a non ASCII compatible encoding
 * gets additional handling that is not present in this excerpt.
 */
6622str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6624 long n = RSTRING_LEN(str);
6626 if (beg > n ||
len < 0)
return Qnil;
6629 if (beg < 0)
return Qnil;
6634 if (!empty)
return Qnil;
6638 VALUE str2 = str_subseq(str, beg,
len);
6640 str_enc_copy_direct(str2, str);
6642 if (RSTRING_LEN(str2) == 0) {
6643 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6677 long beg,
len = RSTRING_LEN(str);
6685 return str_byte_substr(str, beg,
len, TRUE);
6690 return str_byte_substr(str, idx, 1, FALSE);
/*
 * rb_str_byteslice: argc/argv entry point for byte slicing.
 * Visible paths: an explicit (beg, len) pair is forwarded to
 * str_byte_substr() with empty set to TRUE; otherwise argv[0] alone is
 * forwarded to str_byte_aref().  The argument-count test and the conversion
 * of beg and len are not present in this excerpt.
 */
6737rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6742 return str_byte_substr(str, beg,
len, TRUE);
6745 return str_byte_aref(str, argv[0]);
/*
 * str_check_beg_len: validate and normalise a byte range given through the
 * in/out pointers beg and len against the byte length of str.
 *
 * Visible checks:
 *  - *beg greater than slen, or a negative *beg whose sum with slen is
 *    still negative, enters an error branch (body not shown);
 *  - *len larger than slen - *beg enters an adjustment branch (body not
 *    shown; presumably clamps *len -- TODO confirm);
 *  - str_ensure_byte_pos() is called for both *beg and end.
 */
6749str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6751 long end, slen = RSTRING_LEN(str);
6754 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6763 if (*
len > slen - *beg) {
6767 str_ensure_byte_pos(str, *beg);
6768 str_ensure_byte_pos(str, end);
/*
 * rb_str_bytesplice: replace a byte range of str with (a byte range of)
 * another string.
 *
 * Visible behaviour:
 *  - argc must be 2, 3 or 5, otherwise ArgumentError is raised with the
 *    given count;
 *  - argv[0] and argv[2] each have a path that raises TypeError naming the
 *    class of the argument when a Range was expected;
 *  - vlen defaults to the full byte length of val on two visible paths;
 *  - both ranges are validated with str_check_beg_len(), first the target
 *    range on str and then the source range on val;
 *  - str_modify_keep_cr() is called on str, str is associated with the
 *    encoding returned by rb_enc_check(str, val), and rb_str_update_1()
 *    performs the splice.
 */
6793rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6795 long beg,
len, vbeg, vlen;
6800 if (!(argc == 2 || argc == 3 || argc == 5)) {
6801 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6805 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6806 rb_builtin_class_name(argv[0]));
6813 vlen = RSTRING_LEN(val);
6818 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6819 rb_builtin_class_name(argv[2]));
6831 vlen = RSTRING_LEN(val);
6839 str_check_beg_len(str, &beg, &
len);
6840 str_check_beg_len(val, &vbeg, &vlen);
6841 str_modify_keep_cr(str);
6844 rb_enc_associate(str, rb_enc_check(str, val));
6847 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
/*
 * rb_str_reverse: return a new string with the characters of str reversed.
 *
 * Visible behaviour:
 *  - a string of at most one byte is returned as a duplicate made with
 *    str_duplicate(rb_cString, str);
 *  - longer strings take one of several paths: one when
 *    single_byte_optimizable(str) holds, one that measures characters with
 *    rb_enc_fast_mbclen(), and one that uses rb_enc_mbclen() and derives a
 *    code range from the ASCII compatibility of the encoding (the loop
 *    bodies are not present in this excerpt);
 *  - the result gets the same byte length as str and the encoding of str.
 */
6865rb_str_reverse(
VALUE str)
6872 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6873 enc = STR_ENC_GET(str);
6879 if (RSTRING_LEN(str) > 1) {
6880 if (single_byte_optimizable(str)) {
6887 int clen = rb_enc_fast_mbclen(s, e, enc);
6895 cr = rb_enc_asciicompat(enc) ?
6898 int clen = rb_enc_mbclen(s, e, enc);
6907 STR_SET_LEN(rev, RSTRING_LEN(str));
6908 str_enc_copy_direct(rev, str);
/*
 * rb_str_reverse_bang: reverse str in place.
 *
 * Visible behaviour for strings longer than one byte:
 *  - when single_byte_optimizable(str) holds, str_modify_keep_cr() is
 *    called and the buffer is reversed directly (the swap loop is not
 *    present in this excerpt);
 *  - otherwise the contents are replaced with rb_str_reverse(str) through
 *    str_shared_replace().
 * A further str_modify_keep_cr(str) call is visible after these branches
 * (presumably the path for strings of at most one byte -- TODO confirm).
 */
6928rb_str_reverse_bang(
VALUE str)
6930 if (RSTRING_LEN(str) > 1) {
6931 if (single_byte_optimizable(str)) {
6934 str_modify_keep_cr(str);
6935 s = RSTRING_PTR(str);
6944 str_shared_replace(str, rb_str_reverse(str));
6948 str_modify_keep_cr(str);
6973 i = rb_str_index(str, arg, 0);
6975 return RBOOL(i != -1);
7017 rb_raise(rb_eArgError,
"invalid radix %d", base);
7019 return rb_str_to_inum(str, base, FALSE);
7043rb_str_to_f(
VALUE str)
7058rb_str_to_s(
VALUE str)
/*
 * str_cat_char: encode code point c in encoding enc into a local buffer of
 * RUBY_MAX_CHAR_LEN bytes.  n is the encoded length reported by
 * rb_enc_codelen(); the bytes are written with rb_enc_mbcput().
 * The step that appends the buffer to str is not present in this excerpt.
 */
7068str_cat_char(
VALUE str,
unsigned int c, rb_encoding *enc)
7070 char s[RUBY_MAX_CHAR_LEN];
7071 int n = rb_enc_codelen(c, enc);
7073 rb_enc_mbcput(c, s, enc);
/*
 * Size bound used for the escape buffers in this file (declared as
 * CHAR_ESC_LEN + 1 bytes).  13 covers the longest escape form formatted
 * here, a backslash, x, an opening brace, eight hex digits and a closing
 * brace, plus the terminating NUL -- assuming the value is at most 32 bits.
 */
7078#define CHAR_ESC_LEN 13
/*
 * rb_str_buf_cat_escaped_char: format code point c as an escape sequence
 * into a local buffer and measure it with strlen().
 *
 * Formats visible here:
 *  - the character itself (percent-c), on a path whose condition is not
 *    shown;
 *  - backslash-u with four hex digits when c < 0x10000, otherwise
 *    backslash-u with braces;
 *  - backslash-x with two hex digits, or backslash-x with braces.
 * The choice between the u and x families presumably follows unicode_p
 * (the test itself is not present in this excerpt).  Every snprintf() call
 * is bounded by CHAR_ESC_LEN, one less than the size of buf.
 * The append to result and the return value are not visible here.
 */
7081rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
7083 char buf[CHAR_ESC_LEN + 1];
7091 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
7093 else if (c < 0x10000) {
7094 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
7097 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
7102 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
7105 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
7108 l = (int)strlen(buf);
/*
 * ruby_escaped_char: map a control character to its backslash escape.
 *
 * Returns a static string literal for each listed character: NUL, newline,
 * carriage return, tab, form feed, vertical tab (013), backspace (010),
 * bell (007), escape (033) and DEL (0x7f, rendered as backslash c ?).
 * The result for any other value of c is not present in this excerpt.
 */
7114ruby_escaped_char(
int c)
7117 case '\0':
return "\\0";
7118 case '\n':
return "\\n";
7119 case '\r':
return "\\r";
7120 case '\t':
return "\\t";
7121 case '\f':
return "\\f";
7122 case '\013':
return "\\v";
7123 case '\010':
return "\\b";
7124 case '\007':
return "\\a";
7125 case '\033':
return "\\e";
7126 case '\x7f':
return "\\c?";
/*
 * rb_str_escape: build an escaped copy of str in result.
 *
 * The scan keeps prev at the start of the pending run of bytes that can be
 * copied verbatim; each time something must be escaped the pending run is
 * flushed with str_buf_cat() first.
 *
 * Visible handling per character:
 *  - the length n comes from rb_enc_precise_mbclen(); on the invalid path
 *    n falls back to the minimum character length of the encoding, capped
 *    at the bytes remaining, and a byte is emitted as backslash-x with two
 *    hex digits;
 *  - otherwise the code point is decoded and ruby_escaped_char() is
 *    consulted; a named escape is appended as is;
 *  - printable ASCII in an ASCII compatible encoding takes its own branch
 *    (body not shown);
 *  - anything else goes through rb_str_buf_cat_escaped_char().
 * Any trailing verbatim run is flushed at the end.
 */
7132rb_str_escape(
VALUE str)
7135 rb_encoding *enc = rb_enc_from_index(encidx);
7136 const char *p = RSTRING_PTR(str);
7138 const char *prev = p;
7139 char buf[CHAR_ESC_LEN + 1];
7141 int unicode_p = rb_enc_unicode_p(enc);
7142 int asciicompat = rb_enc_asciicompat(enc);
7147 int n = rb_enc_precise_mbclen(p, pend, enc);
7149 if (p > prev) str_buf_cat(result, prev, p - prev);
7150 n = rb_enc_mbminlen(enc);
7152 n = (int)(pend - p);
7154 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7155 str_buf_cat(result, buf, strlen(buf));
7161 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7163 cc = ruby_escaped_char(c);
7165 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7166 str_buf_cat(result, cc, strlen(cc));
7169 else if (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c)) {
7172 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7173 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7177 if (p > prev) str_buf_cat(result, prev, p - prev);
7200 rb_encoding *enc = rb_enc_from_index(encidx);
7201 const char *p, *pend, *prev;
7202 char buf[CHAR_ESC_LEN + 1];
7204 rb_encoding *resenc = rb_default_internal_encoding();
7205 int unicode_p = rb_enc_unicode_p(enc);
7206 int asciicompat = rb_enc_asciicompat(enc);
7208 if (resenc == NULL) resenc = rb_default_external_encoding();
7209 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7210 rb_enc_associate(result, resenc);
7211 str_buf_cat2(result,
"\"");
7219 n = rb_enc_precise_mbclen(p, pend, enc);
7221 if (p > prev) str_buf_cat(result, prev, p - prev);
7222 n = rb_enc_mbminlen(enc);
7224 n = (int)(pend - p);
7226 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
7227 str_buf_cat(result, buf, strlen(buf));
7233 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7235 if ((asciicompat || unicode_p) &&
7236 (c ==
'"'|| c ==
'\\' ||
7241 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
7242 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7243 str_buf_cat2(result,
"\\");
7244 if (asciicompat || enc == resenc) {
7250 case '\n': cc =
'n';
break;
7251 case '\r': cc =
'r';
break;
7252 case '\t': cc =
't';
break;
7253 case '\f': cc =
'f';
break;
7254 case '\013': cc =
'v';
break;
7255 case '\010': cc =
'b';
break;
7256 case '\007': cc =
'a';
break;
7257 case 033: cc =
'e';
break;
7258 default: cc = 0;
break;
7261 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7264 str_buf_cat(result, buf, 2);
7277 (asciicompat && rb_enc_isascii(c, enc) &&
ISPRINT(c))) {
7281 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7282 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7287 if (p > prev) str_buf_cat(result, prev, p - prev);
7288 str_buf_cat2(result,
"\"");
/*
 * True when p is still before e and the byte at p is a dollar sign, an at
 * sign or an opening brace.  Elsewhere in this file it is evaluated right
 * after a hash character to decide whether a backslash has to be emitted.
 * Note that p is evaluated up to four times.
 */
7293#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7313 int encidx = rb_enc_get_index(str);
7314 rb_encoding *enc = rb_enc_from_index(encidx);
7316 const char *p, *pend;
7319 int u8 = (encidx == rb_utf8_encindex());
7320 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
7323 if (!rb_enc_asciicompat(enc)) {
7325 len += strlen(enc->name);
7328 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7331 unsigned char c = *p++;
7334 case '"':
case '\\':
7335 case '\n':
case '\r':
7336 case '\t':
case '\f':
7337 case '\013':
case '\010':
case '\007':
case '\033':
7342 clen = IS_EVSTR(p, pend) ? 2 : 1;
7350 if (u8 && c > 0x7F) {
7351 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7353 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7356 else if (cc <= 0xFFFFF)
7369 if (clen > LONG_MAX -
len) {
7376 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7377 q = RSTRING_PTR(result); qend = q +
len + 1;
7381 unsigned char c = *p++;
7383 if (c ==
'"' || c ==
'\\') {
7387 else if (c ==
'#') {
7388 if (IS_EVSTR(p, pend)) *q++ =
'\\';
7391 else if (c ==
'\n') {
7395 else if (c ==
'\r') {
7399 else if (c ==
'\t') {
7403 else if (c ==
'\f') {
7407 else if (c ==
'\013') {
7411 else if (c ==
'\010') {
7415 else if (c ==
'\007') {
7419 else if (c ==
'\033') {
7429 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7431 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7434 snprintf(q, qend-q,
"u%04X", cc);
7436 snprintf(q, qend-q,
"u{%X}", cc);
7441 snprintf(q, qend-q,
"x%02X", c);
7447 if (!rb_enc_asciicompat(enc)) {
7448 snprintf(q, qend-q, nonascii_suffix, enc->name);
7449 encidx = rb_ascii8bit_encindex();
7452 rb_enc_associate_index(result, encidx);
7458unescape_ascii(
unsigned int c)
/*
 * undump_after_backslash: decode one escape sequence whose backslash has
 * already been consumed, reading from *ss (bounded by s_end).
 *
 * Visible behaviour:
 *  - simple escapes are translated through unescape_ascii() into buf;
 *  - on a Unicode escape the UTF-8 encoding object is looked up once and
 *    cached in a function-level static; if *penc differs from it, undumped
 *    is re-associated with UTF-8;
 *  - a braced hex run must be 1 to 6 digits long, otherwise an error
 *    branch is entered;
 *  - code points in 0xd800..0xdfff enter an error branch, in both the
 *    braced and the unbraced form;
 *  - accepted code points are encoded into buf with rb_enc_mbcput() using
 *    *penc.
 * How utf8 and binary are updated and how *ss is advanced is not present in
 * this excerpt.
 */
7482undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end, rb_encoding **penc,
bool *utf8,
bool *binary)
7484 const char *s = *ss;
7488 unsigned char buf[6];
7489 static rb_encoding *enc_utf8 = NULL;
7506 *buf = unescape_ascii(*s);
7518 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7519 if (*penc != enc_utf8) {
7521 rb_enc_associate(undumped, enc_utf8);
7538 if (hexlen == 0 || hexlen > 6) {
7544 if (0xd800 <= c && c <= 0xdfff) {
7547 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7557 if (0xd800 <= c && c <= 0xdfff) {
7560 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7588static VALUE rb_str_is_ascii_only_p(
VALUE str);
/*
 * str_undump: parse the dumped form of a string back into a string.
 *
 * Visible behaviour:
 *  - the result starts as an empty string in the encoding of str;
 *  - a non ASCII-only input and a failed str_null_check() each enter an
 *    error branch (bodies not shown);
 *  - inputs shorter than two bytes, or not starting with a double quote,
 *    jump to invalid_format;
 *  - after the quoted part an optional .dup suffix is skipped, then a
 *    .force_encoding( suffix is required; anything else is invalid_format;
 *  - a RuntimeError is raised when a Unicode escape was seen together with
 *    force_encoding;
 *  - the encoding name runs up to the next double quote, which must be
 *    followed by exactly one closing parenthesis at the end of input; the
 *    name is resolved with rb_enc_find_index2() and applied to the result;
 *  - escapes inside the quoted part are decoded by
 *    undump_after_backslash();
 *  - invalid_format raises RuntimeError describing the two accepted forms.
 */
7606str_undump(
VALUE str)
7608 const char *s = RSTRING_PTR(str);
7610 rb_encoding *enc = rb_enc_get(str);
7611 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7613 bool binary =
false;
7617 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7620 if (!str_null_check(str, &w)) {
7623 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7624 if (*s !=
'"')
goto invalid_format;
7642 static const char force_encoding_suffix[] =
".force_encoding(\"";
7643 static const char dup_suffix[] =
".dup";
7644 const char *encname;
7649 size =
sizeof(dup_suffix) - 1;
7650 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7652 size =
sizeof(force_encoding_suffix) - 1;
7653 if (s_end - s <= size)
goto invalid_format;
7654 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7658 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
7662 s = memchr(s,
'"', s_end-s);
7664 if (!s)
goto invalid_format;
7665 if (s_end - s != 2)
goto invalid_format;
7666 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7668 encidx = rb_enc_find_index2(encname, (
long)size);
7672 rb_enc_associate_index(undumped, encidx);
7682 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7693 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
/*
 * rb_str_check_dummy_enc: guard against dummy encodings.
 * Enters an error branch when rb_enc_dummy_p(enc) holds; the body of that
 * branch is not present in this excerpt (presumably it raises -- TODO
 * confirm).
 */
7697rb_str_check_dummy_enc(rb_encoding *enc)
7699 if (rb_enc_dummy_p(enc)) {
/*
 * str_true_enc: fetch the encoding of str with STR_ENC_GET() and pass it
 * through rb_str_check_dummy_enc() before use.  The return statement is not
 * present in this excerpt; callers in this file assign the result to an
 * rb_encoding pointer.
 */
7706str_true_enc(
VALUE str)
7708 rb_encoding *enc = STR_ENC_GET(str);
7709 rb_str_check_dummy_enc(enc);
/*
 * check_case_options: fold the option symbols in argv into flags and return
 * the resulting OnigCaseFoldType.
 *
 * Visible rules:
 *  - sym_turkic sets ONIGENC_CASE_FOLD_TURKISH_AZERI and may be followed by
 *    sym_lithuanian; sym_lithuanian sets ONIGENC_CASE_FOLD_LITHUANIAN and
 *    may be followed by sym_turkic; any other second option raises
 *    ArgumentError with the message invalid second option;
 *  - sym_ascii sets ONIGENC_CASE_ASCII_ONLY;
 *  - sym_fold is accepted only when, of the upcase and downcase bits,
 *    exactly the downcase bit is set; it then toggles both
 *    ONIGENC_CASE_FOLD and ONIGENC_CASE_DOWNCASE, and otherwise raises
 *    ArgumentError;
 *  - too many options, or an unknown option, raises ArgumentError.
 * The argc tests guarding these branches are not present in this excerpt.
 */
7713static OnigCaseFoldType
7714check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7719 rb_raise(rb_eArgError,
"too many options");
7720 if (argv[0]==sym_turkic) {
7721 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7723 if (argv[1]==sym_lithuanian)
7724 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7726 rb_raise(rb_eArgError,
"invalid second option");
7729 else if (argv[0]==sym_lithuanian) {
7730 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7732 if (argv[1]==sym_turkic)
7733 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7735 rb_raise(rb_eArgError,
"invalid second option");
7739 rb_raise(rb_eArgError,
"too many options");
7740 else if (argv[0]==sym_ascii)
7741 flags |= ONIGENC_CASE_ASCII_ONLY;
7742 else if (argv[0]==sym_fold) {
7743 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7744 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7746 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7749 rb_raise(rb_eArgError,
"invalid option");
/*
 * case_option_single_p: decide whether case conversion may take the
 * byte-at-a-time path.
 * The visible condition holds when ONIGENC_CASE_ASCII_ONLY is set and the
 * encoding is either UTF-8 or has a maximum character length of one byte.
 * Further conditions and the return statements are not present in this
 * excerpt.
 */
7754case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc,
VALUE str)
7756 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7762#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7763#ifndef CASEMAP_DEBUG
7764# define CASEMAP_DEBUG 0
7772 OnigUChar space[FLEX_ARY_LEN];
/*
 * mapping_buffer_free: release a chain of mapping buffers.
 * Walks the list through the next links; each node is remembered in
 * previous_buffer and the cursor is advanced before that node is handed to
 * ruby_sized_xfree() together with its capa field, so the link is read
 * before the node is freed.  How p is turned into the initial
 * current_buffer is not present in this excerpt.
 */
7776mapping_buffer_free(
void *p)
7780 while (current_buffer) {
7781 previous_buffer = current_buffer;
7782 current_buffer = current_buffer->next;
7783 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7789 {0, mapping_buffer_free,},
7790 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
/*
 * rb_str_casemap: case-map source through the case_map callback of enc and
 * return the converted string.  flags is passed by pointer to the callback.
 *
 * Visible behaviour:
 *  - an empty source is returned as a duplicate;
 *  - output is produced into a chain of buffers; each round sizes a new
 *    buffer as (remaining source bytes * buffer count so far, incremented
 *    first) + CASE_MAPPING_ADDITIONAL_LENGTH, links it at the tail through
 *    pre_buffer, and calls enc->case_map() on the remaining input;
 *  - a negative callback result frees the whole chain and raises
 *    ArgumentError (input string invalid); otherwise the result is stored
 *    as the used size of the buffer and added to target_length;
 *  - with a single buffer the target is created directly from it;
 *    with several, each buffer is copied into the target in order;
 *  - the chain is freed and the target receives the encoding of source.
 *
 * Fix in this revision: the tail-link update had a mis-encoded token in
 * place of the address-of expression; it is restored to take the address of
 * current_buffer->next.
 */
7794rb_str_casemap(
VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7798 const OnigUChar *source_current, *source_end;
7799 int target_length = 0;
7800 VALUE buffer_anchor;
7803 size_t buffer_count = 0;
7804 int buffer_length_or_invalid;
7806 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7808 source_current = (OnigUChar*)RSTRING_PTR(source);
7813 while (source_current < source_end) {
7815 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7816 if (CASEMAP_DEBUG) {
7817 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
7820 *pre_buffer = current_buffer;
7821 pre_buffer = &current_buffer->next;
7822 current_buffer->next = NULL;
7823 current_buffer->capa =
capa;
7824 buffer_length_or_invalid = enc->case_map(flags,
7825 &source_current, source_end,
7826 current_buffer->space,
7827 current_buffer->space+current_buffer->capa,
7829 if (buffer_length_or_invalid < 0) {
7830 current_buffer =
DATA_PTR(buffer_anchor);
7832 mapping_buffer_free(current_buffer);
7833 rb_raise(rb_eArgError,
"input string invalid");
7835 target_length += current_buffer->used = buffer_length_or_invalid;
7837 if (CASEMAP_DEBUG) {
7838 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7841 if (buffer_count==1) {
7842 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7845 char *target_current;
7848 target_current = RSTRING_PTR(target);
7849 current_buffer =
DATA_PTR(buffer_anchor);
7850 while (current_buffer) {
7851 memcpy(target_current, current_buffer->space, current_buffer->used);
7852 target_current += current_buffer->used;
7853 current_buffer = current_buffer->next;
7856 current_buffer =
DATA_PTR(buffer_anchor);
7858 mapping_buffer_free(current_buffer);
7863 str_enc_copy_direct(target, source);
/*
 * rb_str_ascii_casemap: ASCII-only case mapping from source into target.
 *
 * Visible behaviour:
 *  - an empty source returns Qnil immediately;
 *  - when source and target are the same object the output window is the
 *    input buffer itself (in-place conversion); otherwise output goes to
 *    the buffer of target;
 *  - the conversion is done by onigenc_ascii_only_case_map(); a negative
 *    result raises ArgumentError (input string invalid);
 *  - with CASEMAP_DEBUG enabled, a result length different from the input
 *    length is reported on stderr and raised as an internal problem;
 *  - target finally receives the encoding of source via str_enc_copy().
 */
7870rb_str_ascii_casemap(
VALUE source,
VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7872 const OnigUChar *source_current, *source_end;
7873 OnigUChar *target_current, *target_end;
7874 long old_length = RSTRING_LEN(source);
7875 int length_or_invalid;
7877 if (old_length == 0)
return Qnil;
7879 source_current = (OnigUChar*)RSTRING_PTR(source);
7881 if (source == target) {
7882 target_current = (OnigUChar*)source_current;
7883 target_end = (OnigUChar*)source_end;
7886 target_current = (OnigUChar*)RSTRING_PTR(target);
7890 length_or_invalid = onigenc_ascii_only_case_map(flags,
7891 &source_current, source_end,
7892 target_current, target_end, enc);
7893 if (length_or_invalid < 0)
7894 rb_raise(rb_eArgError,
"input string invalid");
7895 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7896 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7897 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7898 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7899 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7902 str_enc_copy(target, source);
/*
 * upcase_single: byte-wise ASCII upcasing of str in place.
 * Each byte is read as unsigned; a byte in the range a..z is replaced by
 * the corresponding letter in A..Z.  modified starts false; the line that
 * sets it, the loop header and the return are not present in this excerpt
 * (callers in this file test the result as a boolean).
 */
7908upcase_single(
VALUE str)
7910 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
7911 bool modified =
false;
7914 unsigned int c = *(
unsigned char*)s;
7916 if (
'a' <= c && c <=
'z') {
7917 *s =
'A' + (c -
'a');
/*
 * rb_str_upcase_bang: upcase str in place.
 *
 * Starts from ONIGENC_CASE_UPCASE, merges the option symbols through
 * check_case_options(), calls str_modify_keep_cr() on str and resolves the
 * encoding with str_true_enc().  Then one of three strategies runs:
 *  - case_option_single_p(): upcase_single() on the bytes, setting
 *    ONIGENC_CASE_MODIFIED when it reports a change;
 *  - ASCII-only option: rb_str_ascii_casemap() with str as both source and
 *    target;
 *  - otherwise: the contents are replaced with the result of
 *    rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED ended up set; the other return
 * is not present in this excerpt.
 */
7945rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7948 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7950 flags = check_case_options(argc, argv, flags);
7951 str_modify_keep_cr(str);
7952 enc = str_true_enc(str);
7953 if (case_option_single_p(flags, enc, str)) {
7954 if (upcase_single(str))
7955 flags |= ONIGENC_CASE_MODIFIED;
7957 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7958 rb_str_ascii_casemap(str, str, &flags, enc);
7960 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7962 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_upcase: return an upcased copy of str.
 *
 * Options are merged into ONIGENC_CASE_UPCASE through check_case_options()
 * and the encoding is resolved with str_true_enc().  Visible strategies:
 *  - case_option_single_p(): ret is a fresh byte copy of str carrying the
 *    same encoding (the conversion call on ret is not present in this
 *    excerpt);
 *  - ASCII-only option: rb_str_ascii_casemap() from str into ret;
 *  - otherwise: ret is the result of rb_str_casemap().
 */
7984rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7987 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7990 flags = check_case_options(argc, argv, flags);
7991 enc = str_true_enc(str);
7992 if (case_option_single_p(flags, enc, str)) {
7993 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7994 str_enc_copy_direct(ret, str);
7997 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7999 rb_str_ascii_casemap(str, ret, &flags, enc);
8002 ret = rb_str_casemap(str, &flags, enc);
/*
 * downcase_single: byte-wise ASCII downcasing of str in place.
 * Each byte is read as unsigned; a byte in the range A..Z is replaced by
 * the corresponding letter in a..z.  modified starts false; the line that
 * sets it, the loop header and the return are not present in this excerpt
 * (callers in this file test the result as a boolean).
 */
8009downcase_single(
VALUE str)
8011 char *s = RSTRING_PTR(str), *send =
RSTRING_END(str);
8012 bool modified =
false;
8015 unsigned int c = *(
unsigned char*)s;
8017 if (
'A' <= c && c <=
'Z') {
8018 *s =
'a' + (c -
'A');
/*
 * rb_str_downcase_bang: downcase str in place.
 *
 * Starts from ONIGENC_CASE_DOWNCASE, merges the option symbols through
 * check_case_options(), calls str_modify_keep_cr() on str and resolves the
 * encoding with str_true_enc().  Then one of three strategies runs:
 *  - case_option_single_p(): downcase_single() on the bytes, setting
 *    ONIGENC_CASE_MODIFIED when it reports a change;
 *  - ASCII-only option: rb_str_ascii_casemap() with str as both source and
 *    target;
 *  - otherwise: the contents are replaced with the result of
 *    rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED ended up set; the other return
 * is not present in this excerpt.
 */
8047rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
8050 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8052 flags = check_case_options(argc, argv, flags);
8053 str_modify_keep_cr(str);
8054 enc = str_true_enc(str);
8055 if (case_option_single_p(flags, enc, str)) {
8056 if (downcase_single(str))
8057 flags |= ONIGENC_CASE_MODIFIED;
8059 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8060 rb_str_ascii_casemap(str, str, &flags, enc);
8062 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8064 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_downcase: return a downcased copy of str.
 *
 * Options are merged into ONIGENC_CASE_DOWNCASE through
 * check_case_options() and the encoding is resolved with str_true_enc().
 * Visible strategies:
 *  - case_option_single_p(): ret is a fresh byte copy of str carrying the
 *    same encoding, converted in place by downcase_single();
 *  - ASCII-only option: rb_str_ascii_casemap() from str into ret;
 *  - otherwise: ret is the result of rb_str_casemap().
 */
8086rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
8089 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8092 flags = check_case_options(argc, argv, flags);
8093 enc = str_true_enc(str);
8094 if (case_option_single_p(flags, enc, str)) {
8095 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8096 str_enc_copy_direct(ret, str);
8097 downcase_single(ret);
8099 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8101 rb_str_ascii_casemap(str, ret, &flags, enc);
8104 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_capitalize_bang: capitalize str in place.
 *
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE, merges the
 * option symbols through check_case_options(), calls str_modify_keep_cr()
 * on str and resolves the encoding with str_true_enc().
 *  - an empty string, or one without a buffer, returns Qnil;
 *  - with the ASCII-only option the conversion is done by
 *    rb_str_ascii_casemap() with str as both source and target;
 *  - otherwise the contents are replaced with the result of
 *    rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED ended up set; the other return
 * is not present in this excerpt.
 */
8132rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
8135 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8137 flags = check_case_options(argc, argv, flags);
8138 str_modify_keep_cr(str);
8139 enc = str_true_enc(str);
8140 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8141 if (flags&ONIGENC_CASE_ASCII_ONLY)
8142 rb_str_ascii_casemap(str, str, &flags, enc);
8144 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8146 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_capitalize: return a capitalized copy of str.
 *
 * Options are merged into ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE
 * through check_case_options() and the encoding is resolved with
 * str_true_enc().
 *  - an empty string, or one without a buffer, now yields a duplicate made
 *    with str_duplicate(rb_cString, str).  Previously the receiver itself
 *    was returned, so mutating the result of this non-mutating method also
 *    mutated the receiver; the empty-input paths of rb_str_swapcase,
 *    rb_str_reverse and rb_str_casemap in this file already return a
 *    duplicate;
 *  - with the ASCII-only option the conversion is done by
 *    rb_str_ascii_casemap() from str into ret;
 *  - otherwise ret is the result of rb_str_casemap().
 */
8170rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
8173 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8176 flags = check_case_options(argc, argv, flags);
8177 enc = str_true_enc(str);
8178 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8179 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8181 rb_str_ascii_casemap(str, ret, &flags, enc);
8184 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_swapcase_bang: swap the case of str in place.
 *
 * Starts from ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE, merges the
 * option symbols through check_case_options(), calls str_modify_keep_cr()
 * on str and resolves the encoding with str_true_enc().
 *  - with the ASCII-only option the conversion is done by
 *    rb_str_ascii_casemap() with str as both source and target;
 *  - otherwise the contents are replaced with the result of
 *    rb_str_casemap().
 * str is returned when ONIGENC_CASE_MODIFIED ended up set; the other return
 * is not present in this excerpt.
 */
8211rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
8214 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8216 flags = check_case_options(argc, argv, flags);
8217 str_modify_keep_cr(str);
8218 enc = str_true_enc(str);
8219 if (flags&ONIGENC_CASE_ASCII_ONLY)
8220 rb_str_ascii_casemap(str, str, &flags, enc);
8222 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8224 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_swapcase: return a copy of str with the case of each letter
 * swapped.
 *
 * Options are merged into ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE
 * through check_case_options() and the encoding is resolved with
 * str_true_enc().
 *  - an empty string, or one without a buffer, yields a duplicate made
 *    with str_duplicate(rb_cString, str);
 *  - with the ASCII-only option the conversion is done by
 *    rb_str_ascii_casemap() from str into ret;
 *  - otherwise ret is the result of rb_str_casemap().
 */
8248rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
8251 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8254 flags = check_case_options(argc, argv, flags);
8255 enc = str_true_enc(str);
8256 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
8257 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8259 rb_str_ascii_casemap(str, ret, &flags, enc);
8262 ret = rb_str_casemap(str, &flags, enc);
8267typedef unsigned char *USTR;
8271 unsigned int now, max;
/*
 * trnext: produce the next code point from a tr-style specification held in
 * the cursor t, expanding ranges.
 *
 * Visible behaviour:
 *  - when the cursor has reached pend, -1 is returned;
 *  - a backslash that is not the last character is recognised as a
 *    separate case (body not shown);
 *  - the current code point is read into t->now;
 *  - a hyphen that is not the last character starts a range; its upper
 *    bound c is read next.  On one path ArgumentError is raised for an
 *    invalid range, printing both ends when they are below 0x80 and using
 *    a generic message otherwise; the t->now < c case is handled in its
 *    own branch (body not shown);
 *  - while a range is being expanded, t->now is pre-incremented past code
 *    points for which ONIGENC_CODE_TO_MBCLEN() reports a length <= 0, and
 *    the walk is bounded by t->max.
 */
8276trnext(
struct tr *t, rb_encoding *enc)
8283 if (t->p == t->pend)
return -1;
8284 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
8287 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8289 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
8291 if (t->p < t->pend) {
8292 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8295 if (t->now < 0x80 && c < 0x80) {
8296 rb_raise(rb_eArgError,
8297 "invalid range \"%c-%c\" in string transliteration",
8301 rb_raise(rb_eArgError,
"invalid range in string transliteration");
8305 else if (t->now < c) {
8314 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8315 if (t->now == t->max) {
8320 if (t->now < t->max) {
8336 const unsigned int errc = -1;
8337 unsigned int trans[256];
8338 rb_encoding *enc, *e1, *e2;
8339 struct tr trsrc, trrepl;
8341 unsigned int c, c0, last = 0;
8342 int modify = 0, i, l;
8343 unsigned char *s, *send;
8345 int singlebyte = single_byte_optimizable(str);
8349#define CHECK_IF_ASCII(c) \
8350 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8351 (cr = ENC_CODERANGE_VALID) : 0)
8355 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8356 if (RSTRING_LEN(repl) == 0) {
8357 return rb_str_delete_bang(1, &src, str);
8361 e1 = rb_enc_check(str, src);
8362 e2 = rb_enc_check(str, repl);
8367 enc = rb_enc_check(src, repl);
8369 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8370 if (RSTRING_LEN(src) > 1 &&
8371 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
8372 trsrc.p + l < trsrc.pend) {
8376 trrepl.p = RSTRING_PTR(repl);
8377 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8378 trsrc.gen = trrepl.gen = 0;
8379 trsrc.now = trrepl.now = 0;
8380 trsrc.max = trrepl.max = 0;
8383 for (i=0; i<256; i++) {
8386 while ((c = trnext(&trsrc, enc)) != errc) {
8391 if (!hash) hash = rb_hash_new();
8395 while ((c = trnext(&trrepl, enc)) != errc)
8398 for (i=0; i<256; i++) {
8399 if (trans[i] != errc) {
8407 for (i=0; i<256; i++) {
8410 while ((c = trnext(&trsrc, enc)) != errc) {
8411 r = trnext(&trrepl, enc);
8412 if (r == errc) r = trrepl.now;
8415 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8418 if (!hash) hash = rb_hash_new();
8426 str_modify_keep_cr(str);
8427 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)
RSTRING_END(str);
8428 termlen = rb_enc_mbminlen(enc);
8431 long offset, max = RSTRING_LEN(str);
8432 unsigned int save = -1;
8433 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8438 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8441 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8444 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8446 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8455 if (cflag) c = last;
8458 else if (cflag) c = errc;
8464 if (c != (
unsigned int)-1) {
8470 tlen = rb_enc_codelen(c, enc);
8476 if (enc != e1) may_modify = 1;
8478 if ((offset = t - buf) + tlen > max) {
8479 size_t MAYBE_UNUSED(old) = max + termlen;
8480 max = offset + tlen + (send - s);
8481 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8484 rb_enc_mbcput(c, t, enc);
8485 if (may_modify && memcmp(s, t, tlen) != 0) {
8491 if (!STR_EMBED_P(str)) {
8492 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8494 TERM_FILL((
char *)t, termlen);
8495 RSTRING(str)->as.heap.ptr = (
char *)buf;
8496 STR_SET_LEN(str, t - buf);
8497 STR_SET_NOEMBED(str);
8498 RSTRING(str)->as.heap.aux.capa = max;
8502 c = (
unsigned char)*s;
8503 if (trans[c] != errc) {
8520 long offset, max = (long)((send - s) * 1.2);
8521 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8526 int r = rb_enc_precise_mbclen((
char *)s, (
char *)send, e1);
8529 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(e1));
8532 c0 = c = rb_enc_mbc_to_codepoint((
char *)s, (
char *)send, e1);
8534 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8542 if (cflag) c = last;
8545 else if (cflag) c = errc;
8549 c = cflag ? last : errc;
8552 tlen = rb_enc_codelen(c, enc);
8557 if (enc != e1) may_modify = 1;
8559 if ((offset = t - buf) + tlen > max) {
8560 size_t MAYBE_UNUSED(old) = max + termlen;
8561 max = offset + tlen + (long)((send - s) * 1.2);
8562 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8566 rb_enc_mbcput(c, t, enc);
8567 if (may_modify && memcmp(s, t, tlen) != 0) {
8575 if (!STR_EMBED_P(str)) {
8576 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8578 TERM_FILL((
char *)t, termlen);
8579 RSTRING(str)->as.heap.ptr = (
char *)buf;
8580 STR_SET_LEN(str, t - buf);
8581 STR_SET_NOEMBED(str);
8582 RSTRING(str)->as.heap.aux.capa = max;
8588 rb_enc_associate(str, enc);
8607 return tr_trans(str, src, repl, 0);
8654 tr_trans(str, src, repl, 0);
/*
 * TR_TABLE_MAX is the number of distinct byte values (UCHAR_MAX + 1).
 * TR_TABLE_SIZE adds one extra slot; code in this file uses the slot at
 * index TR_TABLE_MAX to store a flag next to the per-byte entries.
 */
8658#define TR_TABLE_MAX (UCHAR_MAX+1)
8659#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * tr_setup_table: fold one character-set specification (str) into the byte
 * table stable and, for code points outside the byte range, into hash
 * tables reached through tablep and ctablep.
 *
 * Visible behaviour:
 *  - a specification longer than one byte whose first character is a
 *    caret is treated specially (cflag; the assignment is not shown);
 *  - on one path every byte entry is initialised and the extra slot
 *    stable[TR_TABLE_MAX] is set to cflag; on another, when that slot is
 *    set and cflag is clear, the slot is reset to 0;
 *  - each code point yielded by trnext() that is below TR_TABLE_MAX is
 *    recorded in the local buf as !cflag; larger ones are stored in a hash
 *    created on demand, with rb_hash_aset(table, key, Qtrue) guarded by a
 *    test against the previous table;
 *  - finally each byte entry of stable is combined with buf using logical
 *    AND, so successive specifications narrow the set.
 */
8661tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8662 VALUE *tablep,
VALUE *ctablep, rb_encoding *enc)
8664 const unsigned int errc = -1;
8665 char buf[TR_TABLE_MAX];
8668 VALUE table = 0, ptable = 0;
8669 int i, l, cflag = 0;
8671 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8672 tr.gen =
tr.now =
tr.max = 0;
8674 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8679 for (i=0; i<TR_TABLE_MAX; i++) {
8682 stable[TR_TABLE_MAX] = cflag;
8684 else if (stable[TR_TABLE_MAX] && !cflag) {
8685 stable[TR_TABLE_MAX] = 0;
8687 for (i=0; i<TR_TABLE_MAX; i++) {
8691 while ((c = trnext(&
tr, enc)) != errc) {
8692 if (c < TR_TABLE_MAX) {
8693 buf[(
unsigned char)c] = !cflag;
8698 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8701 table = ptable ? ptable : rb_hash_new();
8705 table = rb_hash_new();
8710 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8711 rb_hash_aset(table, key,
Qtrue);
8715 for (i=0; i<TR_TABLE_MAX; i++) {
8716 stable[i] = stable[i] && buf[i];
8718 if (!table && !cflag) {
/*
 * tr_find: test whether code point c belongs to the set described by table
 * and the two hash tables del and nodel.
 *
 * Visible behaviour:
 *  - for c below TR_TABLE_MAX the answer is the byte table entry;
 *  - otherwise the hashes are consulted: a hit in del that is not also in
 *    nodel takes one branch, a hit in nodel takes another (the returned
 *    values of those branches are not present in this excerpt);
 *  - if neither applies, the extra slot table[TR_TABLE_MAX] decides.
 */
8725tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8727 if (c < TR_TABLE_MAX) {
8728 return table[c] != 0;
8734 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8735 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8739 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8742 return table[TR_TABLE_MAX] ? TRUE : FALSE;
/*
 * rb_str_delete_bang: delete from str, in place, every character selected
 * by the set specifications in argv.
 *
 * Visible behaviour:
 *  - an empty string, or one without a buffer, returns Qnil;
 *  - each argument is checked for encoding compatibility with str and
 *    folded into the table with tr_setup_table(); the first argument is
 *    flagged as such (i==0);
 *  - str_modify_keep_cr() is called, then the buffer is compacted with a
 *    read cursor s and a write cursor t that start at the same position;
 *  - bytes below 0x80 in an ASCII compatible encoding take a fast path;
 *    other characters are decoded with rb_enc_codepoint_len() and tested
 *    with tr_find(); kept characters are re-encoded at t only when t and s
 *    have diverged;
 *  - the terminator is written at t and the length set to the distance of
 *    t from the start of the buffer;
 *  - str is returned when modify is set; the other return is not present
 *    in this excerpt.
 */
8756rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8758 char squeez[TR_TABLE_SIZE];
8759 rb_encoding *enc = 0;
8761 VALUE del = 0, nodel = 0;
8763 int i, ascompat, cr;
8765 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8767 for (i=0; i<argc; i++) {
8771 enc = rb_enc_check(str, s);
8772 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8775 str_modify_keep_cr(str);
8776 ascompat = rb_enc_asciicompat(enc);
8777 s = t = RSTRING_PTR(str);
8784 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8795 c = rb_enc_codepoint_len(s, send, &clen, enc);
8797 if (tr_find(c, squeez, del, nodel)) {
8801 if (t != s) rb_enc_mbcput(c, t, enc);
8808 TERM_FILL(t, TERM_LEN(str));
8809 STR_SET_LEN(str, t - RSTRING_PTR(str));
8812 if (modify)
return str;
/*
 * rb_str_delete: non-mutating counterpart that delegates to
 * rb_str_delete_bang().  Only the delegating call is present in this
 * excerpt; any preparation of str before the call (presumably a duplicate
 * is taken -- TODO confirm) and the return statement are not visible.
 */
8832rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8835 rb_str_delete_bang(argc, argv, str);
/*
 * rb_str_squeeze_bang: collapse runs of the same character in str, in
 * place, optionally restricted to the sets given in argv.
 *
 * Visible behaviour:
 *  - the encoding starts as that of str; each argument is then checked for
 *    compatibility and folded in with tr_setup_table(); when an argument
 *    is not single-byte optimizable a branch is taken (body not shown;
 *    presumably it clears singlebyte -- TODO confirm);
 *  - str_modify_keep_cr() is called; an empty string, or one without a
 *    buffer, returns Qnil;
 *  - three scanning strategies are visible: a pure byte loop using the
 *    squeez table directly, an ASCII fast path for bytes below 0x80, and a
 *    general path that decodes code points and consults tr_find();
 *  - a character is kept when it differs from the previous one (save), or
 *    when sets were given and it is not in them;
 *  - the terminator is rewritten at t, and the length is updated only if
 *    it actually changed;
 *  - str is returned when modify is set; the other return is not present
 *    in this excerpt.
 */
8849rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8851 char squeez[TR_TABLE_SIZE];
8852 rb_encoding *enc = 0;
8853 VALUE del = 0, nodel = 0;
8854 unsigned char *s, *send, *t;
8856 int ascompat, singlebyte = single_byte_optimizable(str);
8860 enc = STR_ENC_GET(str);
8863 for (i=0; i<argc; i++) {
8867 enc = rb_enc_check(str, s);
8868 if (singlebyte && !single_byte_optimizable(s))
8870 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8874 str_modify_keep_cr(str);
8875 s = t = (
unsigned char *)RSTRING_PTR(str);
8876 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8879 ascompat = rb_enc_asciicompat(enc);
8883 unsigned int c = *s++;
8884 if (c != save || (argc > 0 && !squeez[c])) {
8894 if (ascompat && (c = *s) < 0x80) {
8895 if (c != save || (argc > 0 && !squeez[c])) {
8901 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8903 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8904 if (t != s) rb_enc_mbcput(c, t, enc);
8913 TERM_FILL((
char *)t, TERM_LEN(str));
8914 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8915 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8919 if (modify)
return str;
8942rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8945 rb_str_squeeze_bang(argc, argv, str);
8963 return tr_trans(str, src, repl, 1);
8986 tr_trans(str, src, repl, 1);
9015rb_str_count(
int argc,
VALUE *argv,
VALUE str)
9017 char table[TR_TABLE_SIZE];
9018 rb_encoding *enc = 0;
9019 VALUE del = 0, nodel = 0, tstr;
9029 enc = rb_enc_check(str, tstr);
9032 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9033 (ptstr = RSTRING_PTR(tstr),
9034 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
9035 !is_broken_string(str)) {
9037 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9039 s = RSTRING_PTR(str);
9040 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9043 if (*(
unsigned char*)s++ == c) n++;
9049 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9050 for (i=1; i<argc; i++) {
9053 enc = rb_enc_check(str, tstr);
9054 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9057 s = RSTRING_PTR(str);
9058 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
9060 ascompat = rb_enc_asciicompat(enc);
9064 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
9072 c = rb_enc_codepoint_len(s, send, &clen, enc);
9073 if (tr_find(c, table, del, nodel)) {
9084rb_fs_check(
VALUE val)
9088 if (
NIL_P(val))
return 0;
/*
 * Byte-indexed lookup table: an entry is 1 for byte values 0x09..0x0D
 * (HT, LF, VT, FF, CR) and 0x20 (space), and 0 for every other value.
 * Sixteen rows of sixteen entries cover all 256 possible byte values.
 */
9093static const char isspacetable[256] = {
9094 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9095 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9096 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9097 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9098 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9099 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9109 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Table lookup for one byte. The argument is cast to unsigned char before
 * indexing so that a negative plain-char value cannot produce a negative
 * index. The expansion is not parenthesized as a whole.
 */
9112#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * Emits one piece of a split and returns the updated count of pending
 * empty pieces.
 *
 * result      - array that pieces are pushed onto (a yield path is also
 *               visible below; the condition selecting between the two is
 *               not shown in this excerpt)
 * str         - the string being split
 * beg, len    - position and length of the piece
 * empty_count - number of empty pieces seen but not yet emitted; callers
 *               in this file start it at -1 or 0
 */
9115split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
/* An empty piece is only counted, not emitted, while empty_count is non-negative. */
9117 if (empty_count >= 0 &&
len == 0) {
9118 return empty_count + 1;
/* A non-empty piece arrived: first flush the empty pieces that were held back. */
9120 if (empty_count > 0) {
9124 rb_ary_push(result, str_new_empty_String(str));
9125 }
while (--empty_count > 0);
9129 rb_yield(str_new_empty_String(str));
9130 }
while (--empty_count > 0);
/* NOTE(review): the statement that derives the piece from beg/len is not visible here. */
9135 rb_ary_push(result, str);
9144 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9148literal_split_pattern(
VALUE spat, split_type_t default_type)
9150 rb_encoding *enc = STR_ENC_GET(spat);
9156 return SPLIT_TYPE_CHARS;
9158 else if (rb_enc_asciicompat(enc)) {
9159 if (
len == 1 && ptr[0] ==
' ') {
9160 return SPLIT_TYPE_AWK;
9165 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
9166 return SPLIT_TYPE_AWK;
9169 return default_type;
9182rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
9187 split_type_t split_type;
9188 long beg, end, i = 0, empty_count = -1;
9193 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
9195 if (lim <= 0) limit =
Qnil;
9196 else if (lim == 1) {
9197 if (RSTRING_LEN(str) == 0)
9208 if (
NIL_P(limit) && !lim) empty_count = 0;
9210 enc = STR_ENC_GET(str);
9211 split_type = SPLIT_TYPE_REGEXP;
9213 spat = get_pat_quoted(spat, 0);
9216 split_type = SPLIT_TYPE_AWK;
9218 else if (!(spat = rb_fs_check(spat))) {
9219 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
9224 if (split_type != SPLIT_TYPE_AWK) {
9229 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9230 if (split_type == SPLIT_TYPE_AWK) {
9232 split_type = SPLIT_TYPE_STRING;
9237 mustnot_broken(spat);
9238 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Hands one piece (beg, len) to split_string() and stores the returned
 * pending-empty count back into empty_count. Uses the variables result, str
 * and empty_count from the scope where it is expanded.
 */
9246#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9249 char *ptr = RSTRING_PTR(str);
9251 if (split_type == SPLIT_TYPE_AWK) {
9256 if (result) result = rb_ary_new();
9258 if (is_ascii_string(str)) {
9259 while (ptr < eptr) {
9260 c = (
unsigned char)*ptr++;
9262 if (ascii_isspace(c)) {
9268 if (!
NIL_P(limit) && lim <= i)
break;
9271 else if (ascii_isspace(c)) {
9272 SPLIT_STR(beg, end-beg);
9275 if (!
NIL_P(limit)) ++i;
9283 while (ptr < eptr) {
9286 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9295 if (!
NIL_P(limit) && lim <= i)
break;
9299 SPLIT_STR(beg, end-beg);
9302 if (!
NIL_P(limit)) ++i;
9310 else if (split_type == SPLIT_TYPE_STRING) {
9311 char *str_start = ptr;
9312 char *substr_start = ptr;
9313 char *sptr = RSTRING_PTR(spat);
9314 long slen = RSTRING_LEN(spat);
9316 if (result) result = rb_ary_new();
9317 mustnot_broken(str);
9318 enc = rb_enc_check(str, spat);
9319 while (ptr < eptr &&
9320 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9323 if (t != ptr + end) {
9327 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9330 if (!
NIL_P(limit) && lim <= ++i)
break;
9332 beg = ptr - str_start;
9334 else if (split_type == SPLIT_TYPE_CHARS) {
9335 char *str_start = ptr;
9338 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9339 mustnot_broken(str);
9340 enc = rb_enc_get(str);
9341 while (ptr < eptr &&
9342 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9343 SPLIT_STR(ptr - str_start, n);
9345 if (!
NIL_P(limit) && lim <= ++i)
break;
9347 beg = ptr - str_start;
9350 if (result) result = rb_ary_new();
9351 long len = RSTRING_LEN(str);
9359 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
9364 if (start == end && BEG(0) == END(0)) {
9369 else if (last_null == 1) {
9370 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9377 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9383 SPLIT_STR(beg, end-beg);
9384 beg = start = END(0);
9388 for (idx=1; idx < regs->num_regs; idx++) {
9389 if (BEG(idx) == -1)
continue;
9390 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9392 if (!
NIL_P(limit) && lim <= ++i)
break;
9394 if (match) rb_match_unbusy(match);
9396 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9397 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9400 return result ? result : str;
9410 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to a new array created with capacity `size` when no block was
 * given to the current method, and to 0 when a block was given.
 * The `m` argument does not appear in the expansion.
 */
9413#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9419 rb_ary_push(ary, e);
9428#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9431chomp_newline(
const char *p,
const char *e, rb_encoding *enc)
9433 const char *prev = rb_enc_prev_char(p, e, e, enc);
9436 prev = rb_enc_prev_char(p, e, e, enc);
9437 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9449 RSTRING_LEN(rs) != 1 ||
9450 RSTRING_PTR(rs)[0] !=
'\n')) {
9456#define rb_rs get_rs()
9463 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9464 long pos,
len, rslen;
9470 static ID keywords[1];
9475 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9479 if (!ENUM_ELEM(ary, str)) {
9487 if (!RSTRING_LEN(str))
goto end;
9489 ptr = subptr = RSTRING_PTR(str);
9491 len = RSTRING_LEN(str);
9493 rslen = RSTRING_LEN(rs);
9496 enc = rb_enc_get(str);
9498 enc = rb_enc_check(str, rs);
9503 const char *eol = NULL;
9505 while (subend < pend) {
9506 long chomp_rslen = 0;
9508 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9510 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9512 if (eol == subend)
break;
9516 chomp_rslen = -rslen;
9520 if (!subptr) subptr = subend;
9524 }
while (subend < pend);
9526 if (rslen == 0) chomp_rslen = 0;
9528 subend - subptr + (chomp ? chomp_rslen : rslen));
9529 if (ENUM_ELEM(ary, line)) {
9530 str_mod_check(str, ptr,
len);
9532 subptr = eol = NULL;
9537 rsptr = RSTRING_PTR(rs);
9538 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9547 rsptr = RSTRING_PTR(rs);
9548 rslen = RSTRING_LEN(rs);
9551 while (subptr < pend) {
9552 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9556 if (hit != adjusted) {
9560 subend = hit += rslen;
9563 subend = chomp_newline(subptr, subend, enc);
9570 if (ENUM_ELEM(ary, line)) {
9571 str_mod_check(str, ptr,
len);
9576 if (subptr != pend) {
9579 pend = chomp_newline(subptr, pend, enc);
9581 else if (pend - subptr >= rslen &&
9582 memcmp(pend - rslen, rsptr, rslen) == 0) {
9587 ENUM_ELEM(ary, line);
9608rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9611 return rb_str_enumerate_lines(argc, argv, str, 0);
9624rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9626 VALUE ary = WANTARRAY(
"lines", 0);
9627 return rb_str_enumerate_lines(argc, argv, str, ary);
9641 for (i=0; i<RSTRING_LEN(str); i++) {
9642 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
9660rb_str_each_byte(
VALUE str)
9663 return rb_str_enumerate_bytes(str, 0);
9675rb_str_bytes(
VALUE str)
9677 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9678 return rb_str_enumerate_bytes(str, ary);
9696 ptr = RSTRING_PTR(str);
9697 len = RSTRING_LEN(str);
9698 enc = rb_enc_get(str);
9701 for (i = 0; i <
len; i += n) {
9702 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9707 for (i = 0; i <
len; i += n) {
9708 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9729rb_str_each_char(
VALUE str)
9732 return rb_str_enumerate_chars(str, 0);
9744rb_str_chars(
VALUE str)
9747 return rb_str_enumerate_chars(str, ary);
9751rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9756 const char *ptr, *end;
9759 if (single_byte_optimizable(str))
9760 return rb_str_enumerate_bytes(str, ary);
9763 ptr = RSTRING_PTR(str);
9765 enc = STR_ENC_GET(str);
9768 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9789rb_str_each_codepoint(
VALUE str)
9792 return rb_str_enumerate_codepoints(str, 0);
9804rb_str_codepoints(
VALUE str)
9807 return rb_str_enumerate_codepoints(str, ary);
/*
 * Compiles the grapheme-cluster pattern (backslash followed by 'X') for the
 * encoding `enc` and returns the resulting regex_t.
 *
 * The pattern bytes default to the ASCII spelling. For the UTF-16/UTF-32
 * encoding indexes handled by CASE_UTF below, the same two characters are
 * spelled out in that encoding's byte order instead.
 * rb_fatal() is called with the Onigmo error text on the failure path.
 */
9811get_reg_grapheme_cluster(rb_encoding *enc)
9813 int encidx = rb_enc_to_index(enc);
9815 const OnigUChar source_ascii[] =
"\\X";
9816 const OnigUChar *source = source_ascii;
/* sizeof - 1: the pattern length excludes the literal's terminating NUL. */
9817 size_t source_len =
sizeof(source_ascii) - 1;
/*
 * CHARS_16xx / CHARS_32xx expand one code unit into its bytes in big- or
 * little-endian order. CASE_UTF(e) uses them to build a static byte array
 * for the pattern and points source/source_len at it; no "- 1" is needed
 * there because the array is a brace initializer with no NUL.
 */
9820#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9821#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9822#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9823#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9824#define CASE_UTF(e) \
9825 case ENCINDEX_UTF_##e: { \
9826 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9827 source = source_UTF_##e; \
9828 source_len = sizeof(source_UTF_##e); \
9831 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9839 regex_t *reg_grapheme_cluster;
/*
 * onig_new() returns the compiled regex through its first argument, so it
 * must be given the address of the local pointer.
 */
9841 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9842 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9844 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9845 onig_error_code_to_str(message, r, &einfo);
9846 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9849 return reg_grapheme_cluster;
/*
 * Returns the grapheme-cluster regex kept in a function-local static for
 * the UTF-8 encoding index, compiling it with get_reg_grapheme_cluster()
 * the first time it is requested.
 *
 * NOTE(review): the return for other encodings is not visible in this
 * excerpt; callers in this file test the result for null and then compile
 * (and later onig_free) their own regex, so presumably it is NULL - confirm.
 */
9853get_cached_reg_grapheme_cluster(rb_encoding *enc)
9855 int encidx = rb_enc_to_index(enc);
/* Starts out NULL and is filled in on first use. */
9856 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9858 if (encidx == rb_utf8_encindex()) {
9859 if (!reg_grapheme_cluster_utf8) {
9860 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9863 return reg_grapheme_cluster_utf8;
9872 size_t grapheme_cluster_count = 0;
9873 rb_encoding *enc = get_encoding(str);
9874 const char *ptr, *end;
9876 if (!rb_enc_unicode_p(enc)) {
9880 bool cached_reg_grapheme_cluster =
true;
9881 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9882 if (!reg_grapheme_cluster) {
9883 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9884 cached_reg_grapheme_cluster =
false;
9887 ptr = RSTRING_PTR(str);
9891 OnigPosition
len = onig_match(reg_grapheme_cluster,
9892 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9893 (
const OnigUChar *)ptr, NULL, 0);
9894 if (
len <= 0)
break;
9895 grapheme_cluster_count++;
9899 if (!cached_reg_grapheme_cluster) {
9900 onig_free(reg_grapheme_cluster);
9903 return SIZET2NUM(grapheme_cluster_count);
9907rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9910 rb_encoding *enc = get_encoding(str);
9911 const char *ptr0, *ptr, *end;
9913 if (!rb_enc_unicode_p(enc)) {
9914 return rb_str_enumerate_chars(str, ary);
9919 bool cached_reg_grapheme_cluster =
true;
9920 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9921 if (!reg_grapheme_cluster) {
9922 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9923 cached_reg_grapheme_cluster =
false;
9926 ptr0 = ptr = RSTRING_PTR(str);
9930 OnigPosition
len = onig_match(reg_grapheme_cluster,
9931 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9932 (
const OnigUChar *)ptr, NULL, 0);
9933 if (
len <= 0)
break;
9938 if (!cached_reg_grapheme_cluster) {
9939 onig_free(reg_grapheme_cluster);
9959rb_str_each_grapheme_cluster(
VALUE str)
9962 return rb_str_enumerate_grapheme_clusters(str, 0);
9974rb_str_grapheme_clusters(
VALUE str)
9977 return rb_str_enumerate_grapheme_clusters(str, ary);
9981chopped_length(
VALUE str)
9983 rb_encoding *enc = STR_ENC_GET(str);
9984 const char *p, *p2, *beg, *end;
9986 beg = RSTRING_PTR(str);
9987 end = beg + RSTRING_LEN(str);
9988 if (beg >= end)
return 0;
9989 p = rb_enc_prev_char(beg, end, end, enc);
9991 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9992 p2 = rb_enc_prev_char(beg, p, end, enc);
9993 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
10009rb_str_chop_bang(
VALUE str)
10011 str_modify_keep_cr(str);
10012 if (RSTRING_LEN(str) > 0) {
10014 len = chopped_length(str);
10015 STR_SET_LEN(str,
len);
10016 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10035rb_str_chop(
VALUE str)
10041smart_chomp(
VALUE str,
const char *e,
const char *p)
10043 rb_encoding *enc = rb_enc_get(str);
10044 if (rb_enc_mbminlen(enc) > 1) {
10049 pp = e - rb_enc_mbminlen(enc);
10052 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10060 if (--e > p && *(e-1) ==
'\r') {
10077 char *pp, *e, *rsptr;
10079 char *
const p = RSTRING_PTR(str);
10080 long len = RSTRING_LEN(str);
10082 if (
len == 0)
return 0;
10085 return smart_chomp(str, e, p);
10088 enc = rb_enc_get(str);
10091 if (rb_enc_mbminlen(enc) > 1) {
10096 pp -= rb_enc_mbminlen(enc);
10099 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
10106 while (e > p && *(e-1) ==
'\n') {
10108 if (e > p && *(e-1) ==
'\r')
10114 if (rslen >
len)
return len;
10116 enc = rb_enc_get(rs);
10117 newline = rsptr[rslen-1];
10118 if (rslen == rb_enc_mbminlen(enc)) {
10120 if (newline ==
'\n')
10121 return smart_chomp(str, e, p);
10125 return smart_chomp(str, e, p);
10129 enc = rb_enc_check(str, rs);
10130 if (is_broken_string(rs)) {
10134 if (p[
len-1] == newline &&
10136 memcmp(rsptr, pp, rslen) == 0)) {
10137 if (at_char_boundary(p, pp, e, enc))
10138 return len - rslen;
10150chomp_rs(
int argc,
const VALUE *argv)
10154 VALUE rs = argv[0];
10166 long olen = RSTRING_LEN(str);
10167 long len = chompped_length(str, rs);
10168 if (
len >= olen)
return Qnil;
10169 str_modify_keep_cr(str);
10170 STR_SET_LEN(str,
len);
10171 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
10188rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
10191 str_modifiable(str);
10192 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
10193 rs = chomp_rs(argc, argv);
10195 return rb_str_chomp_string(str, rs);
10208rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
10210 VALUE rs = chomp_rs(argc, argv);
10216lstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
10218 const char *
const start = s;
10220 if (!s || s >= e)
return 0;
10223 if (single_byte_optimizable(str)) {
10224 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
10229 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10249rb_str_lstrip_bang(
VALUE str)
10253 long olen, loffset;
10255 str_modify_keep_cr(str);
10256 enc = STR_ENC_GET(str);
10258 loffset = lstrip_offset(str, start, start+olen, enc);
10260 long len = olen-loffset;
10261 s = start + loffset;
10262 memmove(start, s,
len);
10263 STR_SET_LEN(str,
len);
10264 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10287rb_str_lstrip(
VALUE str)
10292 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
10293 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
10298rstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
10302 rb_str_check_dummy_enc(enc);
10306 if (!s || s >= e)
return 0;
10310 if (single_byte_optimizable(str)) {
10312 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
10317 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10337rb_str_rstrip_bang(
VALUE str)
10341 long olen, roffset;
10343 str_modify_keep_cr(str);
10344 enc = STR_ENC_GET(str);
10346 roffset = rstrip_offset(str, start, start+olen, enc);
10348 long len = olen - roffset;
10350 STR_SET_LEN(str,
len);
10351 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10374rb_str_rstrip(
VALUE str)
10378 long olen, roffset;
10380 enc = STR_ENC_GET(str);
10382 roffset = rstrip_offset(str, start, start+olen, enc);
10384 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
10400rb_str_strip_bang(
VALUE str)
10403 long olen, loffset, roffset;
10406 str_modify_keep_cr(str);
10407 enc = STR_ENC_GET(str);
10409 loffset = lstrip_offset(str, start, start+olen, enc);
10410 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10412 if (loffset > 0 || roffset > 0) {
10413 long len = olen-roffset;
10416 memmove(start, start + loffset,
len);
10418 STR_SET_LEN(str,
len);
10419 TERM_FILL(start+
len, rb_enc_mbminlen(enc));
10442rb_str_strip(
VALUE str)
10445 long olen, loffset, roffset;
10446 rb_encoding *enc = STR_ENC_GET(str);
10449 loffset = lstrip_offset(str, start, start+olen, enc);
10450 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10452 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10457scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10460 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10466 end = pos + RSTRING_LEN(pat);
10476 rb_encoding *enc = STR_ENC_GET(str);
10480 if (RSTRING_LEN(str) > end)
10481 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10490 if (!regs || regs->num_regs == 1) {
10496 for (
int i = 1; i < regs->num_regs; i++) {
10502 rb_ary_push(result, s);
10557 long last = -1, prev = 0;
10558 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10560 pat = get_pat_quoted(pat, 1);
10561 mustnot_broken(str);
10563 VALUE ary = rb_ary_new();
10565 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10568 rb_ary_push(ary, result);
10570 if (last >= 0) rb_pat_search(pat, str, last, 1);
10575 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10579 str_mod_check(str, p,
len);
10581 if (last >= 0) rb_pat_search(pat, str, last, 1);
10605rb_str_hex(
VALUE str)
10607 return rb_str_to_inum(str, 16, FALSE);
10632rb_str_oct(
VALUE str)
10634 return rb_str_to_inum(str, -8, FALSE);
10637#ifndef HAVE_CRYPT_R
10642 rb_nativethread_lock_t lock;
10643} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10646crypt_mutex_initialize(
void)
10717# define CRYPT_END() ALLOCV_END(databuf)
10719 extern char *crypt(
const char *,
const char *);
10720# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10723 const char *s, *saltp;
10726 char salt_8bit_clean[3];
10730 mustnot_wchar(str);
10731 mustnot_wchar(salt);
10733 saltp = RSTRING_PTR(salt);
10734 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10735 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10739 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10740 salt_8bit_clean[0] = saltp[0] & 0x7f;
10741 salt_8bit_clean[1] = saltp[1] & 0x7f;
10742 salt_8bit_clean[2] =
'\0';
10743 saltp = salt_8bit_clean;
10748# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10749 data->initialized = 0;
10751 res = crypt_r(s, saltp, data);
10753 crypt_mutex_initialize();
10755 res = crypt(s, saltp);
10796 char *ptr, *p, *pend;
10799 unsigned long sum0 = 0;
10804 ptr = p = RSTRING_PTR(str);
10805 len = RSTRING_LEN(str);
10811 str_mod_check(str, ptr,
len);
10814 sum0 += (
unsigned char)*p;
10825 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10826 sum0 &= (((
unsigned long)1)<<bits)-1;
10846rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10850 long width,
len, flen = 1, fclen = 1;
10853 const char *f =
" ";
10854 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10856 int singlebyte = 1, cr;
10860 enc = STR_ENC_GET(str);
10861 termlen = rb_enc_mbminlen(enc);
10865 enc = rb_enc_check(str, pad);
10866 f = RSTRING_PTR(pad);
10867 flen = RSTRING_LEN(pad);
10868 fclen = str_strlen(pad, enc);
10869 singlebyte = single_byte_optimizable(pad);
10870 if (flen == 0 || fclen == 0) {
10871 rb_raise(rb_eArgError,
"zero width padding");
10874 len = str_strlen(str, enc);
10875 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10877 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10881 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10882 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10884 size = RSTRING_LEN(str);
10885 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10886 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10887 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10888 rb_raise(rb_eArgError,
"argument too big");
10892 p = RSTRING_PTR(res);
10894 memset(p, *f, llen);
10898 while (llen >= fclen) {
10904 memcpy(p, f, llen2);
10908 memcpy(p, RSTRING_PTR(str), size);
10911 memset(p, *f, rlen);
10915 while (rlen >= fclen) {
10921 memcpy(p, f, rlen2);
10925 TERM_FILL(p, termlen);
10926 STR_SET_LEN(res, p-RSTRING_PTR(res));
10949rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10951 return rb_str_justify(argc, argv, str,
'l');
10965rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10967 return rb_str_justify(argc, argv, str,
'r');
10982rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10984 return rb_str_justify(argc, argv, str,
'c');
11000 sep = get_pat_quoted(sep, 0);
11012 pos = rb_str_index(str, sep, 0);
11013 if (pos < 0)
goto failed;
11018 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11021 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11035 long pos = RSTRING_LEN(str);
11037 sep = get_pat_quoted(sep, 0);
11050 pos = rb_str_rindex(str, sep, pos);
11059 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11061 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
11073rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
11077 for (i=0; i<argc; i++) {
11078 VALUE tmp = argv[i];
11080 if (rb_reg_start_with_p(tmp, str))
11084 const char *p, *s, *e;
11089 enc = rb_enc_check(str, tmp);
11090 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11091 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11092 p = RSTRING_PTR(str);
11095 if (!at_char_right_boundary(p, s, e, enc))
11097 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11113rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
11117 for (i=0; i<argc; i++) {
11118 VALUE tmp = argv[i];
11119 const char *p, *s, *e;
11124 enc = rb_enc_check(str, tmp);
11125 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
11126 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
11127 p = RSTRING_PTR(str);
11130 if (!at_char_boundary(p, s, e, enc))
11132 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11148deleted_prefix_length(
VALUE str,
VALUE prefix)
11150 const char *strptr, *prefixptr;
11151 long olen, prefixlen;
11152 rb_encoding *enc = rb_enc_get(str);
11156 if (!is_broken_string(prefix) ||
11157 !rb_enc_asciicompat(enc) ||
11158 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11159 enc = rb_enc_check(str, prefix);
11163 prefixlen = RSTRING_LEN(prefix);
11164 if (prefixlen <= 0)
return 0;
11165 olen = RSTRING_LEN(str);
11166 if (olen < prefixlen)
return 0;
11167 strptr = RSTRING_PTR(str);
11168 prefixptr = RSTRING_PTR(prefix);
11169 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
11170 if (is_broken_string(prefix)) {
11171 if (!is_broken_string(str)) {
11175 const char *strend = strptr + olen;
11176 const char *after_prefix = strptr + prefixlen;
11177 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11197rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
11200 str_modify_keep_cr(str);
11202 prefixlen = deleted_prefix_length(str, prefix);
11203 if (prefixlen <= 0)
return Qnil;
11217rb_str_delete_prefix(
VALUE str,
VALUE prefix)
11221 prefixlen = deleted_prefix_length(str, prefix);
11222 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
11224 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11237deleted_suffix_length(
VALUE str,
VALUE suffix)
11239 const char *strptr, *suffixptr;
11240 long olen, suffixlen;
11244 if (is_broken_string(suffix))
return 0;
11245 enc = rb_enc_check(str, suffix);
11248 suffixlen = RSTRING_LEN(suffix);
11249 if (suffixlen <= 0)
return 0;
11250 olen = RSTRING_LEN(str);
11251 if (olen < suffixlen)
return 0;
11252 strptr = RSTRING_PTR(str);
11253 suffixptr = RSTRING_PTR(suffix);
11254 const char *strend = strptr + olen;
11255 const char *before_suffix = strend - suffixlen;
11256 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
11257 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
11272rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
11274 long olen, suffixlen,
len;
11275 str_modifiable(str);
11277 suffixlen = deleted_suffix_length(str, suffix);
11278 if (suffixlen <= 0)
return Qnil;
11280 olen = RSTRING_LEN(str);
11281 str_modify_keep_cr(str);
11282 len = olen - suffixlen;
11283 STR_SET_LEN(str,
len);
11284 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
11300rb_str_delete_suffix(
VALUE str,
VALUE suffix)
11304 suffixlen = deleted_suffix_length(str, suffix);
11305 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
11307 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11314 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
11322 val = rb_fs_check(val);
11325 "value of %"PRIsVALUE
" must be String or Regexp",
11329 rb_warn_deprecated(
"'$;'", NULL);
11346 str_modifiable(str);
11348 rb_encoding *encoding = rb_to_encoding(enc);
11349 int idx = rb_enc_to_index(encoding);
11356 rb_enc_associate_index(str, idx);
11380 if (STR_EMBED_P(str)) {
11381 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11386 str_replace_shared_without_enc(str2, str);
11388 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11421rb_str_valid_encoding_p(
VALUE str)
11441rb_str_is_ascii_only_p(
VALUE str)
11451 static const char ellipsis[] =
"...";
11452 const long ellipsislen =
sizeof(ellipsis) - 1;
11453 rb_encoding *
const enc = rb_enc_get(str);
11454 const long blen = RSTRING_LEN(str);
11455 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11456 VALUE estr, ret = 0;
11459 if (
len * rb_enc_mbminlen(enc) >= blen ||
11463 else if (
len <= ellipsislen ||
11465 if (rb_enc_asciicompat(enc)) {
11467 rb_enc_associate(ret, enc);
11474 else if (ret =
rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11479 rb_enc_from_encoding(enc), 0,
Qnil);
11486str_compat_and_valid(
VALUE str, rb_encoding *enc)
11492 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11495 rb_encoding *e = STR_ENC_GET(str);
11498 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11504static VALUE enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr);
11509 rb_encoding *enc = STR_ENC_GET(str);
11514rb_enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl)
11517 if (enc == STR_ENC_GET(str)) {
11522 return enc_str_scrub(enc, str, repl, cr);
11526enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr)
11530 const char *rep, *p, *e, *p1, *sp;
11536 rb_raise(rb_eArgError,
"both of block and replacement given");
11543 if (!
NIL_P(repl)) {
11544 repl = str_compat_and_valid(repl, enc);
11547 if (rb_enc_dummy_p(enc)) {
11550 encidx = rb_enc_to_index(enc);
/*
 * Points rep/replen at a static copy of the string literal `str`.
 * The array is declared with sizeof(str)-1 elements, so the literal's
 * terminating NUL is not stored and sizeof(replace) is the exact byte count
 * of the replacement. Taking the length from sizeof rather than from a NUL
 * scan matters for the literals used in this file that contain \x00 bytes.
 */
11552#define DEFAULT_REPLACE_CHAR(str) do { \
11553 static const char replace[sizeof(str)-1] = str; \
11554 rep = replace; replen = (int)sizeof(replace); \
11557 slen = RSTRING_LEN(str);
11558 p = RSTRING_PTR(str);
11563 if (rb_enc_asciicompat(enc)) {
11569 else if (!
NIL_P(repl)) {
11570 rep = RSTRING_PTR(repl);
11571 replen = RSTRING_LEN(repl);
11574 else if (encidx == rb_utf8_encindex()) {
11575 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11579 DEFAULT_REPLACE_CHAR(
"?");
11584 p = search_nonascii(p, e);
11589 int ret = rb_enc_precise_mbclen(p, e, enc);
11608 if (e - p < clen) clen = e - p;
11615 for (; clen > 1; clen--) {
11616 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11627 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11628 str_mod_check(str, sp, slen);
11629 repl = str_compat_and_valid(repl, enc);
11636 p = search_nonascii(p, e);
11662 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11663 str_mod_check(str, sp, slen);
11664 repl = str_compat_and_valid(repl, enc);
11673 long mbminlen = rb_enc_mbminlen(enc);
11677 else if (!
NIL_P(repl)) {
11678 rep = RSTRING_PTR(repl);
11679 replen = RSTRING_LEN(repl);
11681 else if (encidx == ENCINDEX_UTF_16BE) {
11682 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11684 else if (encidx == ENCINDEX_UTF_16LE) {
11685 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11687 else if (encidx == ENCINDEX_UTF_32BE) {
11688 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11690 else if (encidx == ENCINDEX_UTF_32LE) {
11691 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11694 DEFAULT_REPLACE_CHAR(
"?");
11698 int ret = rb_enc_precise_mbclen(p, e, enc);
11711 if (e - p < clen) clen = e - p;
11712 if (clen <= mbminlen * 2) {
11717 for (; clen > mbminlen; clen-=mbminlen) {
11718 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11728 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11729 str_mod_check(str, sp, slen);
11730 repl = str_compat_and_valid(repl, enc);
11755 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11756 str_mod_check(str, sp, slen);
11757 repl = str_compat_and_valid(repl, enc);
11793str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11801static ID id_normalize;
11802static ID id_normalized_p;
11803static VALUE mUnicodeNormalize;
11806unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11808 static int UnicodeNormalizeRequired = 0;
11811 if (!UnicodeNormalizeRequired) {
11812 rb_require(
"unicode_normalize/normalize.rb");
11813 UnicodeNormalizeRequired = 1;
11817 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11854rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11856 return unicode_normalize_common(argc, argv, str, id_normalize);
11870rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11872 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11899rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11901 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12033#define sym_equal rb_obj_equal
12036sym_printable(
const char *s,
const char *send, rb_encoding *enc)
12040 int c = rb_enc_precise_mbclen(s, send, enc);
12044 c = rb_enc_mbc_to_codepoint(s, send, enc);
12052rb_str_symname_p(
VALUE sym)
12057 rb_encoding *resenc = rb_default_internal_encoding();
12059 if (resenc == NULL) resenc = rb_default_external_encoding();
12060 enc = STR_ENC_GET(sym);
12061 ptr = RSTRING_PTR(sym);
12062 len = RSTRING_LEN(sym);
12063 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
12071rb_str_quote_unprintable(
VALUE str)
12076 rb_encoding *resenc;
12079 resenc = rb_default_internal_encoding();
12080 if (resenc == NULL) resenc = rb_default_external_encoding();
12081 enc = STR_ENC_GET(str);
12082 ptr = RSTRING_PTR(str);
12083 len = RSTRING_LEN(str);
12084 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12085 !sym_printable(ptr, ptr +
len, enc)) {
12086 return rb_str_escape(str);
12092rb_id_quote_unprintable(
ID id)
12094 VALUE str = rb_id2str(
id);
12095 if (!rb_str_symname_p(str)) {
12096 return rb_str_escape(str);
12114sym_inspect(
VALUE sym)
12121 if (!rb_str_symname_p(str)) {
12123 len = RSTRING_LEN(str);
12124 rb_str_resize(str,
len + 1);
12125 dest = RSTRING_PTR(str);
12126 memmove(dest + 1, dest,
len);
12129 rb_encoding *enc = STR_ENC_GET(str);
12130 VALUE orig_str = str;
12132 len = RSTRING_LEN(orig_str);
12133 str = rb_enc_str_new(0,
len + 1, enc);
12136 ptr = RSTRING_PTR(orig_str);
12137 dest = RSTRING_PTR(str);
12138 memcpy(dest + 1, ptr,
len);
12158rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
12163 rb_raise(rb_eArgError,
"no receiver given");
12260 return rb_str_match(
rb_sym2str(sym), other);
12275sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
12277 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
12290sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
12292 return rb_str_match_m_p(argc, argv, sym);
12310 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
12321sym_length(
VALUE sym)
12335sym_empty(
VALUE sym)
12369sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
12385sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
12401sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
12415sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
12417 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12430sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12432 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12444sym_encoding(
VALUE sym)
12450string_for_symbol(
VALUE name)
12455 rb_raise(
rb_eTypeError,
"%+"PRIsVALUE
" is not a symbol nor a string",
12469 name = string_for_symbol(name);
12470 return rb_intern_str(name);
12479 name = string_for_symbol(name);
12503 return rb_fstring(str);
12510 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII),
true,
false);
12522 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12523 rb_enc_autoload(enc);
12527 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
false);
12531rb_enc_literal_str(
const char *ptr,
long len, rb_encoding *enc)
12533 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12534 rb_enc_autoload(enc);
12538 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc),
true,
true);
12549rb_yjit_str_concat_codepoint(
VALUE str,
VALUE codepoint)
12554 if (RB_LIKELY(code >= 0 && code < 0xff)) {
12555 rb_str_buf_cat_byte(str, (
char) code);
12569 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12735 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Undefines a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because the destination buffer is full.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current process.
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
ruby_value_type
C-level type of an object.