Ruby 3.4.5p51 (2025-07-16 revision 20cda200d3ce092571d0b5d342dadca69636cb0f)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* Flags of RString
83 *
84 * 0: STR_SHARED (equal to ELTS_SHARED)
85 * The string is shared. The buffer this string points to is owned by
86 * another string (the shared root).
87 * 1: RSTRING_NOEMBED
88 * The string is not embedded. When a string is embedded, the contents
89 * follow the header. When a string is not embedded, the contents are
90 * on a separately allocated buffer.
91 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
92 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
93 * It emits a deprecation warning when mutated for the first time.
94 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
95 * The string was allocated by the `Symbol#to_s` method.
96 * It emits a deprecation warning when mutated for the first time.
97 * 4: STR_PRECOMPUTED_HASH
98 * The string is embedded and has its precomputed hashcode stored
99 * after the terminator.
100 * 5: STR_SHARED_ROOT
101 * Other strings may point to the contents of this string. When this
102 * flag is set, STR_SHARED must not be set.
103 * 6: STR_BORROWED
104 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
105 * to be unshared by rb_str_tmp_frozen_release.
106 * 7: STR_TMPLOCK
107 * The pointer to the buffer is passed to a system call such as
108 * read(2). Any modification and realloc is prohibited.
109 * 8-9: ENC_CODERANGE
110 * Stores the coderange of the string.
111 * 10-16: ENCODING
112 * Stores the encoding of the string.
113 * 17: RSTRING_FSTR
114 * The string is a fstring. The string is deduplicated in the fstring
115 * table.
116 * 18: STR_NOFREE
117 * Do not free this string's buffer when the string is reclaimed
118 * by the garbage collector. Used for when the string buffer is a C
119 * string literal.
120 * 19: STR_FAKESTR
121 * The string is not allocated or managed by the garbage collector.
122 * Typically, the string object header (struct RString) is temporarily
123 * allocated on C stack.
124 */
125
126#define RUBY_MAX_CHAR_LEN 16
/* RString flag bits; the numbers match the flag table above. */
127#define STR_PRECOMPUTED_HASH FL_USER4
128#define STR_SHARED_ROOT FL_USER5
129#define STR_BORROWED FL_USER6
130#define STR_TMPLOCK FL_USER7
131#define STR_NOFREE FL_USER18
132#define STR_FAKESTR FL_USER19
133
/* Set the NOEMBED flag on str and clear the SHARED, SHARED_ROOT and BORROWED flags. */
134#define STR_SET_NOEMBED(str) do {\
135 FL_SET((str), STR_NOEMBED);\
136 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
137} while (0)
/* Clear the NOEMBED, SHARED and NOFREE flags on str. */
138#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139
/* Store n into the length field of str; nothing else is touched. */
140#define STR_SET_LEN(str, n) do { \
141 RSTRING(str)->len = (n); \
142} while (0)
143
144static inline bool
145str_encindex_fastpath(int encindex)
146{
147 // The overwhelming majority of strings are in one of these 3 encodings.
148 switch (encindex) {
149 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_UTF_8:
151 case ENCINDEX_US_ASCII:
152 return true;
153 default:
154 return false;
155 }
156}
157
158static inline bool
159str_enc_fastpath(VALUE str)
160{
161 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
162}
163
/* Terminator width in bytes: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
164#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write termlen zero bytes at ptr. The first byte is always stored directly;
 * memset is only called when the terminator is wider than one byte. */
165#define TERM_FILL(ptr, termlen) do {\
166 char *const term_fill_ptr = (ptr);\
167 const int term_fill_len = (termlen);\
168 *term_fill_ptr = '\0';\
169 if (UNLIKELY(term_fill_len > 1))\
170 memset(term_fill_ptr, 0, term_fill_len);\
171} while (0)
172
/*
 * Resize the buffer of str so it can hold `capacity` bytes plus the
 * terminator required by str's encoding.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/*
 * Same as RESIZE_CAPA with an explicit terminator length.
 *
 * An embedded string is moved to a newly allocated heap buffer only when its
 * slot cannot hold capacity + termlen bytes; otherwise it is left untouched.
 * A heap string is reallocated in place and must not be shared.
 *
 * `capacity` and `termlen` are parenthesized wherever they appear inside an
 * expression, so that compound arguments (e.g. a conditional expression)
 * keep their intended meaning.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
196
/*
 * Make str share the buffer of shared_str: record shared_str as str's shared
 * root (through the write barrier), flag str as SHARED and shared_str as
 * SHARED_ROOT. A shared_str whose class is 0 is additionally flagged
 * BORROWED. Nothing is done when str is a fake string. The assertions check
 * that str's pointer lies within shared_str's contents.
 */
197#define STR_SET_SHARED(str, shared_str) do { \
198 if (!FL_TEST(str, STR_FAKESTR)) { \
199 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
200 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
201 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
202 FL_SET((str), STR_SHARED); \
203 FL_SET((shared_str), STR_SHARED_ROOT); \
204 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
205 FL_SET_RAW((shared_str), STR_BORROWED); \
206 } \
207} while (0)
208
/* Heap buffer pointer of a non-embedded string. */
209#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
210#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
211/* TODO: include the terminator size in capa. */
212
/* Encoding object of str, looked up through get_encoding(). */
213#define STR_ENC_GET(str) get_encoding(str)
214
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0. In that configuration only a
 * substring that ends exactly at `end` is reported as sharable; when it is
 * nonzero every substring is. */
215#if !defined SHARABLE_MIDDLE_SUBSTRING
216# define SHARABLE_MIDDLE_SUBSTRING 0
217#endif
218#if !SHARABLE_MIDDLE_SUBSTRING
219#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
220#else
221#define SHARABLE_SUBSTRING_P(beg, len, end) 1
222#endif
223
224
225static inline long
226str_embed_capa(VALUE str)
227{
228 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
229}
230
231bool
232rb_str_reembeddable_p(VALUE str)
233{
234 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
235}
236
237static inline size_t
238rb_str_embed_size(long capa)
239{
240 return offsetof(struct RString, as.embed.ary) + capa;
241}
242
243size_t
244rb_str_size_as_embedded(VALUE str)
245{
246 size_t real_size;
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
249 }
250 /* if the string is not currently embedded, but it can be embedded, how
251 * much space would it require */
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
254 }
255 else {
256 real_size = sizeof(struct RString);
257 }
258
259 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
260 real_size += sizeof(st_index_t);
261 }
262
263 return real_size;
264}
265
/*
 * Tell whether the GC can allocate a slot large enough for an embedded
 * string of len bytes plus a termlen-byte terminator.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const long payload = len + termlen;
    return rb_gc_size_allocatable_p(rb_str_embed_size(payload));
}
271
272static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
273static VALUE str_new_frozen(VALUE klass, VALUE orig);
274static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
275static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
276static VALUE str_new(VALUE klass, const char *ptr, long len);
277static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
278static inline void str_modifiable(VALUE str);
279static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
280static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
281
282static inline void
283str_make_independent(VALUE str)
284{
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str), len, 0L, termlen);
288}
289
290static inline int str_dependent_p(VALUE str);
291
292void
293rb_str_make_independent(VALUE str)
294{
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
297 }
298}
299
300void
301rb_str_make_embedded(VALUE str)
302{
303 RUBY_ASSERT(rb_str_reembeddable_p(str));
304 RUBY_ASSERT(!STR_EMBED_P(str));
305
306 char *buf = RSTRING(str)->as.heap.ptr;
307 long len = RSTRING(str)->len;
308
309 STR_SET_EMBED(str);
310 STR_SET_LEN(str, len);
311
312 if (len > 0) {
313 memcpy(RSTRING_PTR(str), buf, len);
314 ruby_xfree(buf);
315 }
316
317 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
318}
319
/*
 * Print a diagnostic on stderr saying that func is about to return NULL.
 * The message itself invites the developer to attach a debugger here.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
329
330/* symbols for [up|down|swap]case/capitalize options */
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
332
333static rb_encoding *
334get_encoding(VALUE str)
335{
336 return rb_enc_from_index(ENCODING_GET(str));
337}
338
339static void
340mustnot_broken(VALUE str)
341{
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
344 }
345}
346
347static void
348mustnot_wchar(VALUE str)
349{
350 rb_encoding *enc = STR_ENC_GET(str);
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
353 }
354}
355
356static int fstring_cmp(VALUE a, VALUE b);
357
358static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
359
/*
 * Defined only when long and pointers have the same size. It enables the
 * variant of fstring_hash() that reads a fake string's hash from its capa
 * field.
 */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
364
#ifdef PRECOMPUTED_FAKESTR_HASH
/*
 * Hash function of the fstring table. For a fake string the hash is read
 * back from the capa field; every other string goes through rb_str_hash().
 */
static st_index_t
fstring_hash(VALUE str)
{
    if (!FL_TEST_RAW(str, STR_FAKESTR)) {
        return rb_str_hash(str);
    }
    // register_fstring precomputes the hash and stores it in capa for fake strings
    return (st_index_t)RSTRING(str)->as.heap.aux.capa;
}
#else
#define fstring_hash rb_str_hash
#endif
380
/* st hash type built from fstring_cmp (comparison) and fstring_hash (hashing). */
381const struct st_hash_type rb_fstring_hash_type = {
382 fstring_cmp,
383 fstring_hash,
384};
385
/* True when str does not have the FL_EXIVAR flag set and its class is exactly rb_cString. */
386#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387
388static inline st_index_t
389str_do_hash(VALUE str)
390{
391 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
393 if (e && !is_ascii_string(str)) {
394 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
395 }
396 return h;
397}
398
399static VALUE
400str_store_precomputed_hash(VALUE str, st_index_t hash)
401{
402 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
403 RUBY_ASSERT(STR_EMBED_P(str));
404
405#if RUBY_DEBUG
406 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
407 size_t free_bytes = str_embed_capa(str) - used_bytes;
408 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
409#endif
410
411 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
412
413 FL_SET(str, STR_PRECOMPUTED_HASH);
414
415 return str;
416}
417
419 VALUE fstr;
420 bool copy;
421 bool force_precompute_hash;
422};
423
/*
 * st_update() callback used by register_fstring().
 *
 * existing != 0: the table already holds an equal key. If that key is a
 * garbage object, its RSTRING_FSTR flag is cleared, arg->fstr is set to
 * Qundef and the entry is deleted (register_fstring then retries);
 * otherwise the existing key is returned through arg->fstr.
 *
 * existing == 0: a frozen string suitable for the table is built from the
 * candidate (copied or wrapped for fake strings, re-created when not frozen,
 * shared or not a bare String), flagged RSTRING_FSTR and stored as both key
 * and value.
 */
424static int
425fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
426{
427 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
428 VALUE str = (VALUE)*key;
429
430 if (existing) {
431 /* because of lazy sweep, str may be unmarked already and swept
432 * at next time */
433
434 if (rb_objspace_garbage_object_p(str)) {
435 arg->fstr = Qundef;
436 // When RSTRING_FSTR strings are swept, they call `st_delete`.
437 // To avoid a race condition if an equivalent string was inserted
438 // we must remove the flag immediately.
439 FL_UNSET_RAW(str, RSTRING_FSTR);
440 return ST_DELETE;
441 }
442
443 arg->fstr = str;
444 return ST_STOP;
445 }
446 else {
447 // Unless the string is empty or binary, its coderange has been precomputed.
448 int coderange = ENC_CODERANGE(str);
449
450 if (FL_TEST_RAW(str, STR_FAKESTR)) {
451 if (arg->copy) {
452 VALUE new_str;
453 long len = RSTRING_LEN(str);
/* capa reserves room for the hash stored after the terminator */
454 long capa = len + sizeof(st_index_t);
455 int term_len = TERM_LEN(str);
456
457 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
458 new_str = str_alloc_embed(rb_cString, capa + term_len);
459 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
460 STR_SET_LEN(new_str, RSTRING_LEN(str));
461 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
462 rb_enc_copy(new_str, str);
463 str_store_precomputed_hash(new_str, fstring_hash(str));
464 }
465 else {
466 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
467 rb_enc_copy(new_str, str);
468#ifdef PRECOMPUTED_FAKESTR_HASH
/* store the hash only when the new string has spare room for it */
469 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
470 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
471 }
472#endif
473 }
474 str = new_str;
475 }
476 else {
/* no copy requested: wrap the fake string's buffer without copying it */
477 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
478 RSTRING(str)->len,
479 ENCODING_GET(str));
480 }
481 OBJ_FREEZE(str);
482 }
483 else {
484 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
485 str = str_new_frozen(rb_cString, str);
486 }
487 if (STR_SHARED_P(str)) { /* str should not be shared */
488 /* shared substring */
489 str_make_independent(str);
/* NOTE(review): the listing numbers skip a line here, so a statement
 * appears to be missing from this copy; confirm against upstream. */
491 }
492 if (!BARE_STRING_P(str)) {
493 str = str_new_frozen(rb_cString, str);
494 }
495 }
496
/* restore the coderange read from the candidate before it was replaced */
497 ENC_CODERANGE_SET(str, coderange);
498 RBASIC(str)->flags |= RSTRING_FSTR;
499
500 *key = *value = arg->fstr = str;
501 return ST_CONTINUE;
502 }
503}
504
/*
 * Return the interned (fstring) form of str.
 *
 * A string already flagged RSTRING_FSTR is returned as is. A string that is
 * not a bare String (see BARE_STRING_P) is never replaced by another object:
 * it is frozen and returned itself, after being made to share the interned
 * string's buffer when it reaches the table lookup. A bare String is
 * replaced by the table entry returned from register_fstring().
 */
505VALUE
506rb_fstring(VALUE str)
507{
508 VALUE fstr;
509 int bare;
510
511 Check_Type(str, T_STRING);
512
513 if (FL_TEST(str, RSTRING_FSTR))
514 return str;
515
516 bare = BARE_STRING_P(str);
517 if (!bare) {
518 if (STR_EMBED_P(str)) {
519 OBJ_FREEZE(str);
520 return str;
521 }
522
/* a shared root that is not itself shared is returned unchanged */
523 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
/* NOTE(review): the listing numbers skip a line here, so a statement
 * appears to be missing from this copy; confirm against upstream. */
525 return str;
526 }
527 }
528
/* resize to the current length unless the string is frozen, NOFREE or chilled */
529 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
530 rb_str_resize(str, RSTRING_LEN(str));
531
532 fstr = register_fstring(str, false, false);
533
534 if (!bare) {
535 str_replace_shared_without_enc(str, fstr);
536 OBJ_FREEZE(str);
537 return str;
538 }
539 return fstr;
540}
541
542static VALUE
543register_fstring(VALUE str, bool copy, bool force_precompute_hash)
544{
545 struct fstr_update_arg args = {
546 .copy = copy,
547 .force_precompute_hash = force_precompute_hash
548 };
549
550#if SIZEOF_VOIDP == SIZEOF_LONG
551 if (FL_TEST_RAW(str, STR_FAKESTR)) {
552 // if the string hasn't been interned, we'll need the hash twice, so we
553 // compute it once and store it in capa
554 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
555 }
556#endif
557
558 RB_VM_LOCK_ENTER();
559 {
560 st_table *frozen_strings = rb_vm_fstring_table();
561 do {
562 args.fstr = str;
563 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
564 } while (UNDEF_P(args.fstr));
565 }
566 RB_VM_LOCK_LEAVE();
567
568 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
569 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
570 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
571 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
572
573 return args.fstr;
574}
575
/*
 * Initialize the caller-provided RString header fake_str so that it refers
 * to the len bytes at name with encoding index encidx, and return it as a
 * VALUE. The header is flagged NOEMBED, NOFREE and FAKESTR; the bytes are
 * referenced, not copied. A NULL name is replaced by "".
 */
576static VALUE
577setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
578{
579 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
580
581 if (!name) {
/* NOTE(review): the listing numbers skip a line here, so a statement
 * appears to be missing from this copy; confirm against upstream. */
583 name = "";
584 }
585
586 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
587
588 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
589 fake_str->len = len;
590 fake_str->as.heap.ptr = (char *)name;
591 fake_str->as.heap.aux.capa = len;
592 return (VALUE)fake_str;
593}
594
595/*
596 * set up a fake string which refers a static string literal.
597 */
598VALUE
599rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
600{
601 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
602}
603
604/*
605 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
606 * shared string which refers a static string literal. `ptr` must
607 * point a constant string.
608 */
609VALUE
610rb_fstring_new(const char *ptr, long len)
611{
612 struct RString fake_str;
613 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
614}
615
616VALUE
617rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
618{
619 struct RString fake_str;
620 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
621}
622
623VALUE
624rb_fstring_cstr(const char *ptr)
625{
626 return rb_fstring_new(ptr, strlen(ptr));
627}
628
629static int
630fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
631{
632 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
633 return ST_CONTINUE;
634}
635
636static int
637fstring_cmp(VALUE a, VALUE b)
638{
639 long alen, blen;
640 const char *aptr, *bptr;
641 RSTRING_GETMEM(a, aptr, alen);
642 RSTRING_GETMEM(b, bptr, blen);
643 return (alen != blen ||
644 ENCODING_GET(a) != ENCODING_GET(b) ||
645 memcmp(aptr, bptr, alen) != 0);
646}
647
/*
 * Tell whether str can be treated as a sequence of single-byte characters.
 * True for ASCII-8BIT and US-ASCII, for a string whose cached coderange is
 * 7BIT, and for an encoding whose maximum character length is 1. The
 * answer is conservative: false does not prove a multibyte character exists.
 */
648static inline bool
649single_byte_optimizable(VALUE str)
650{
651 int encindex = ENCODING_GET(str);
652 switch (encindex) {
653 case ENCINDEX_ASCII_8BIT:
654 case ENCINDEX_US_ASCII:
655 return true;
656 case ENCINDEX_UTF_8:
657 // For UTF-8 it's worth scanning the string coderange when unknown.
/* NOTE(review): the listing numbers skip a line here, so the statement
 * for the UTF-8 case appears to be missing from this copy; confirm
 * against upstream. */
659 }
660 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
661 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
662 return true;
663 }
664
665 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
666 return true;
667 }
668
669 /* Conservative. Possibly single byte.
670 * "\xa1" in Shift_JIS for example. */
671 return false;
672}
673
675
/*
 * Return a pointer to the first byte in [p, e) that has its high bit (0x80)
 * set, or NULL when there is none.
 *
 * When enough bytes are available, the middle of the range is tested one
 * uintptr_t word at a time against NONASCII_MASK; bytes before the first
 * word boundary (only when unaligned word access is unavailable) and the
 * final partial word are tested byte by byte.
 */
676static inline const char *
677search_nonascii(const char *p, const char *e)
678{
679 const uintptr_t *s, *t;
680
/* NONASCII_MASK has 0x80 set in every byte of a uintptr_t. */
681#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
682# if SIZEOF_UINTPTR_T == 8
683# define NONASCII_MASK UINT64_C(0x8080808080808080)
684# elif SIZEOF_UINTPTR_T == 4
685# define NONASCII_MASK UINT32_C(0x80808080)
686# else
687# error "don't know what to do."
688# endif
689#else
690# if SIZEOF_UINTPTR_T == 8
691# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
692# elif SIZEOF_UINTPTR_T == 4
693# define NONASCII_MASK 0x80808080UL /* or...? */
694# else
695# error "don't know what to do."
696# endif
697#endif
698
699 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
700#if !UNALIGNED_WORD_ACCESS
/* advance p to the next word boundary, testing each skipped byte;
 * the cases fall through on purpose */
701 if ((uintptr_t)p % SIZEOF_VOIDP) {
702 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
703 p += l;
704 switch (l) {
705 default: UNREACHABLE;
706#if SIZEOF_VOIDP > 4
707 case 7: if (p[-7]&0x80) return p-7;
708 case 6: if (p[-6]&0x80) return p-6;
709 case 5: if (p[-5]&0x80) return p-5;
710 case 4: if (p[-4]&0x80) return p-4;
711#endif
712 case 3: if (p[-3]&0x80) return p-3;
713 case 2: if (p[-2]&0x80) return p-2;
714 case 1: if (p[-1]&0x80) return p-1;
715 case 0: break;
716 }
717 }
718#endif
719#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
720#define aligned_ptr(value) \
721 __builtin_assume_aligned((value), sizeof(uintptr_t))
722#else
723#define aligned_ptr(value) (uintptr_t *)(value)
724#endif
725 s = aligned_ptr(p);
/* t bounds the loop so that every word read ends at or before e */
726 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
727#undef aligned_ptr
728 for (;s < t; s++) {
729 if (*s & NONASCII_MASK) {
/* the bit index of the first set mask bit, divided by 8, is the
 * byte offset within the word; which end counts as "first"
 * depends on endianness */
730#ifdef WORDS_BIGENDIAN
731 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
732#else
733 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
734#endif
735 }
736 }
737 p = (const char *)s;
738 }
739
/* fewer than a word remains: test the tail byte by byte, falling through */
740 switch (e - p) {
741 default: UNREACHABLE;
742#if SIZEOF_VOIDP > 4
743 case 7: if (e[-7]&0x80) return e-7;
744 case 6: if (e[-6]&0x80) return e-6;
745 case 5: if (e[-5]&0x80) return e-5;
746 case 4: if (e[-4]&0x80) return e-4;
747#endif
748 case 3: if (e[-3]&0x80) return e-3;
749 case 2: if (e[-2]&0x80) return e-2;
750 case 1: if (e[-1]&0x80) return e-1;
751 case 0: return NULL;
752 }
753}
754
/*
 * Scan the len bytes at p in encoding enc and return a coderange constant.
 * For an ASCII-compatible encoding the scan skips ahead with
 * search_nonascii() and returns ENC_CODERANGE_7BIT when no high-bit byte is
 * found; otherwise characters are measured with rb_enc_precise_mbclen().
 *
 * NOTE(review): the listing numbers skip lines 763, 771 and 781, so the
 * ASCII-8BIT result and the checks on `ret` appear to be missing from this
 * copy; confirm against upstream before relying on the code below.
 */
755static int
756coderange_scan(const char *p, long len, rb_encoding *enc)
757{
758 const char *e = p + len;
759
760 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
761 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
762 p = search_nonascii(p, e);
764 }
765
766 if (rb_enc_asciicompat(enc)) {
767 p = search_nonascii(p, e);
768 if (!p) return ENC_CODERANGE_7BIT;
769 for (;;) {
770 int ret = rb_enc_precise_mbclen(p, e, enc);
772 p += MBCLEN_CHARFOUND_LEN(ret);
773 if (p == e) break;
774 p = search_nonascii(p, e);
775 if (!p) break;
776 }
777 }
778 else {
779 while (p < e) {
780 int ret = rb_enc_precise_mbclen(p, e, enc);
782 p += MBCLEN_CHARFOUND_LEN(ret);
783 }
784 }
785 return ENC_CODERANGE_VALID;
786}
787
/*
 * Scan [s, e) in encoding enc, updating the coderange accumulated in *cr,
 * and return a byte count: e - s when the whole range was consumed, or the
 * offset p - s at which rb_enc_precise_mbclen() did not find a character.
 * A *cr that is already ENC_CODERANGE_BROKEN short-circuits the scan.
 *
 * NOTE(review): the listing numbers skip lines 800, 812, 825 and 831, so
 * several statements (presumably updates of *cr) appear to be missing from
 * this copy; confirm against upstream.
 */
788long
789rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
790{
791 const char *p = s;
792
793 if (*cr == ENC_CODERANGE_BROKEN)
794 return e - s;
795
796 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
797 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
798 if (*cr == ENC_CODERANGE_VALID) return e - s;
799 p = search_nonascii(p, e);
801 return e - s;
802 }
803 else if (rb_enc_asciicompat(enc)) {
804 p = search_nonascii(p, e);
805 if (!p) {
/* no high-bit byte: keep an earlier VALID result, otherwise record 7BIT */
806 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
807 return e - s;
808 }
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
811 if (!MBCLEN_CHARFOUND_P(ret)) {
813 return p - s;
814 }
815 p += MBCLEN_CHARFOUND_LEN(ret);
816 if (p == e) break;
817 p = search_nonascii(p, e);
818 if (!p) break;
819 }
820 }
821 else {
822 while (p < e) {
823 int ret = rb_enc_precise_mbclen(p, e, enc);
824 if (!MBCLEN_CHARFOUND_P(ret)) {
826 return p - s;
827 }
828 p += MBCLEN_CHARFOUND_LEN(ret);
829 }
830 }
832 return e - s;
833}
834
835static inline void
836str_enc_copy(VALUE str1, VALUE str2)
837{
838 rb_enc_set_index(str1, ENCODING_GET(str2));
839}
840
841/* Like str_enc_copy, but does not check frozen status of str1.
842 * You should use this only if you're certain that str1 is not frozen. */
843static inline void
844str_enc_copy_direct(VALUE str1, VALUE str2)
845{
846 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
847 if (inlined_encoding == ENCODING_INLINE_MAX) {
848 rb_enc_set_index(str1, rb_enc_get_index(str2));
849 }
850 else {
851 ENCODING_SET_INLINED(str1, inlined_encoding);
852 }
853}
854
/*
 * NOTE(review): the listing numbers skip many lines inside this function
 * (864, 866, 870-871, 873, 876, 878), so the statements that set the
 * coderange and the case labels of the switch appear to be missing from
 * this copy. The code below is incomplete as shown; confirm against
 * upstream.
 */
855static void
856rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
857{
858 /* this function is designed for copying encoding and coderange
859 * from src to new string "dest" which is made from the part of src.
860 */
861 str_enc_copy(dest, src);
862 if (RSTRING_LEN(dest) == 0) {
863 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
865 else
867 return;
868 }
869 switch (ENC_CODERANGE(src)) {
872 break;
874 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
875 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
877 else
879 break;
880 default:
881 break;
882 }
883}
884
/*
 * Copy the encoding of src to dest.
 * NOTE(review): the listing numbers skip line 889; judging by the function
 * name a coderange copy presumably belongs there; confirm against upstream.
 */
885static void
886rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
887{
888 str_enc_copy(dest, src);
890}
891
892static int
893enc_coderange_scan(VALUE str, rb_encoding *enc)
894{
895 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
896}
897
898int
899rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
900{
901 return enc_coderange_scan(str, enc);
902}
903
/*
 * Return the coderange of str, scanning the contents and caching the result
 * in str when the cached value is ENC_CODERANGE_UNKNOWN.
 * NOTE(review): the declarator line (function name and parameter list,
 * listing line 905) is missing from this copy; the body uses a variable
 * named str. Restore it from upstream.
 */
904int
906{
907 int cr = ENC_CODERANGE(str);
908
909 if (cr == ENC_CODERANGE_UNKNOWN) {
910 cr = enc_coderange_scan(str, get_encoding(str));
911 ENC_CODERANGE_SET(str, cr);
912 }
913 return cr;
914}
915
916static inline bool
917rb_enc_str_asciicompat(VALUE str)
918{
919 int encindex = ENCODING_GET_INLINED(str);
920 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
921}
922
/*
 * NOTE(review): the declarator line (listing line 924) and the case labels
 * of the switch (lines 927 and 929) are missing from this copy, so neither
 * the function name nor the coderange values that select each branch are
 * visible here. Restore them from upstream.
 */
923int
925{
926 switch(ENC_CODERANGE(str)) {
928 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
930 return true;
931 default:
932 return false;
933 }
934}
935
936static inline void
937str_mod_check(VALUE s, const char *p, long len)
938{
939 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
940 rb_raise(rb_eRuntimeError, "string modified");
941 }
942}
943
944static size_t
945str_capacity(VALUE str, const int termlen)
946{
947 if (STR_EMBED_P(str)) {
948 return str_embed_capa(str) - termlen;
949 }
950 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
951 return RSTRING(str)->len;
952 }
953 else {
954 return RSTRING(str)->as.heap.aux.capa;
955 }
956}
957
/*
 * Return str_capacity() of str using the terminator length of its encoding.
 * NOTE(review): the declarator line (listing line 959) is missing from this
 * copy; callers elsewhere in this file invoke rb_str_capacity(str), which
 * presumably is this function. Restore it from upstream.
 */
958size_t
960{
961 return str_capacity(str, TERM_LEN(str));
962}
963
964static inline void
965must_not_null(const char *ptr)
966{
967 if (!ptr) {
968 rb_raise(rb_eArgError, "NULL pointer given");
969 }
970}
971
/*
 * Allocate a string object of class klass with capa bytes following the
 * header. The resulting object size must be nonzero and allocatable by
 * the GC (both asserted).
 * NOTE(review): the listing numbers skip line 980, which holds the rest of
 * the NEWOBJ_OF argument list; confirm against upstream.
 */
972static inline VALUE
973str_alloc_embed(VALUE klass, size_t capa)
974{
975 size_t size = rb_str_embed_size(capa);
976 RUBY_ASSERT(size > 0);
977 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
978
979 NEWOBJ_OF(str, struct RString, klass,
981
982 return (VALUE)str;
983}
984
985static inline VALUE
986str_alloc_heap(VALUE klass)
987{
988 NEWOBJ_OF(str, struct RString, klass,
989 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
990
991 return (VALUE)str;
992}
993
/*
 * Allocate an embedded string of class klass with zero capacity and zero
 * the whole embed area of its slot.
 * NOTE(review): the listing numbers skip line 1000, so a statement appears
 * to be missing from this copy; confirm against upstream.
 */
994static inline VALUE
995empty_str_alloc(VALUE klass)
996{
997 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
998 VALUE str = str_alloc_embed(klass, 0);
999 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1001 return str;
1002}
1003
1004static VALUE
1005str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1006{
1007 VALUE str;
1008
1009 if (len < 0) {
1010 rb_raise(rb_eArgError, "negative string size (or size too big)");
1011 }
1012
1013 if (enc == NULL) {
1014 enc = rb_ascii8bit_encoding();
1015 }
1016
1017 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1018
1019 int termlen = rb_enc_mbminlen(enc);
1020
1021 if (STR_EMBEDDABLE_P(len, termlen)) {
1022 str = str_alloc_embed(klass, len + termlen);
1023 if (len == 0) {
1024 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1025 }
1026 }
1027 else {
1028 str = str_alloc_heap(klass);
1029 RSTRING(str)->as.heap.aux.capa = len;
1030 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1031 * integer overflow. If we can STATIC_ASSERT that, the following
1032 * mul_add_mul can be reverted to a simple ALLOC_N. */
1033 RSTRING(str)->as.heap.ptr =
1034 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1035 }
1036
1037 rb_enc_raw_set(str, enc);
1038
1039 if (ptr) {
1040 memcpy(RSTRING_PTR(str), ptr, len);
1041 }
1042
1043 STR_SET_LEN(str, len);
1044 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1045 return str;
1046}
1047
1048static VALUE
1049str_new(VALUE klass, const char *ptr, long len)
1050{
1051 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1052}
1053
1054VALUE
1055rb_str_new(const char *ptr, long len)
1056{
1057 return str_new(rb_cString, ptr, len);
1058}
1059
1060VALUE
1061rb_usascii_str_new(const char *ptr, long len)
1062{
1063 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1064}
1065
1066VALUE
1067rb_utf8_str_new(const char *ptr, long len)
1068{
1069 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1070}
1071
1072VALUE
1073rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1074{
1075 return str_enc_new(rb_cString, ptr, len, enc);
1076}
1077
1078VALUE
1079rb_str_new_cstr(const char *ptr)
1080{
1081 must_not_null(ptr);
1082 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1083 * memory regions, and that cannot be detected by the MSAN. Just
1084 * trust the programmer that the argument passed here is a sane C
1085 * string. */
1086 __msan_unpoison_string(ptr);
1087 return rb_str_new(ptr, strlen(ptr));
1088}
1089
/*
 * NOTE(review): the declarator line (listing line 1091) and the body (line
 * 1093) are missing from this copy, leaving only the return type and the
 * braces. Restore the definition from upstream.
 */
1090VALUE
1092{
1094}
1095
/*
 * NOTE(review): the body (listing line 1099) is missing from this copy, so
 * the function as shown returns no value. Restore it from upstream.
 */
1096VALUE
1097rb_utf8_str_new_cstr(const char *ptr)
1098{
1100}
1101
1102VALUE
1103rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1104{
1105 must_not_null(ptr);
1106 if (rb_enc_mbminlen(enc) != 1) {
1107 rb_raise(rb_eArgError, "wchar encoding given");
1108 }
1109 return rb_enc_str_new(ptr, strlen(ptr), enc);
1110}
1111
1112static VALUE
1113str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1114{
1115 VALUE str;
1116
1117 if (len < 0) {
1118 rb_raise(rb_eArgError, "negative string size (or size too big)");
1119 }
1120
1121 if (!ptr) {
1122 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1123 }
1124 else {
1125 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1126 str = str_alloc_heap(klass);
1127 RSTRING(str)->len = len;
1128 RSTRING(str)->as.heap.ptr = (char *)ptr;
1129 RSTRING(str)->as.heap.aux.capa = len;
1130 RBASIC(str)->flags |= STR_NOFREE;
1131 rb_enc_associate_index(str, encindex);
1132 }
1133 return str;
1134}
1135
1136VALUE
1137rb_str_new_static(const char *ptr, long len)
1138{
1139 return str_new_static(rb_cString, ptr, len, 0);
1140}
1141
1142VALUE
1143rb_usascii_str_new_static(const char *ptr, long len)
1144{
1145 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1146}
1147
1148VALUE
1149rb_utf8_str_new_static(const char *ptr, long len)
1150{
1151 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1152}
1153
1154VALUE
1155rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1156{
1157 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1158}
1159
1160static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1161 rb_encoding *from, rb_encoding *to,
1162 int ecflags, VALUE ecopts);
1163
1164static inline bool
1165is_enc_ascii_string(VALUE str, rb_encoding *enc)
1166{
1167 int encidx = rb_enc_to_index(enc);
1168 if (rb_enc_get_index(str) == encidx)
1169 return is_ascii_string(str);
1170 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1171}
1172
1173VALUE
1174rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1175{
1176 long len;
1177 const char *ptr;
1178 VALUE newstr;
1179
1180 if (!to) return str;
1181 if (!from) from = rb_enc_get(str);
1182 if (from == to) return str;
1183 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1184 rb_is_ascii8bit_enc(to)) {
1185 if (STR_ENC_GET(str) != to) {
1186 str = rb_str_dup(str);
1187 rb_enc_associate(str, to);
1188 }
1189 return str;
1190 }
1191
1192 RSTRING_GETMEM(str, ptr, len);
1193 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1194 from, to, ecflags, ecopts);
1195 if (NIL_P(newstr)) {
1196 /* some error, return original */
1197 return str;
1198 }
1199 return newstr;
1200}
1201
1202VALUE
1203rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1204 rb_encoding *from, int ecflags, VALUE ecopts)
1205{
1206 long olen;
1207
1208 olen = RSTRING_LEN(newstr);
1209 if (ofs < -olen || olen < ofs)
1210 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1211 if (ofs < 0) ofs += olen;
1212 if (!from) {
1213 STR_SET_LEN(newstr, ofs);
1214 return rb_str_cat(newstr, ptr, len);
1215 }
1216
1217 rb_str_modify(newstr);
1218 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1219 rb_enc_get(newstr),
1220 ecflags, ecopts);
1221}
1222
1223VALUE
1224rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1225{
1226 STR_SET_LEN(str, 0);
1227 rb_enc_associate(str, enc);
1228 rb_str_cat(str, ptr, len);
1229 return str;
1230}
1231
1232static VALUE
1233str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1234 rb_encoding *from, rb_encoding *to,
1235 int ecflags, VALUE ecopts)
1236{
1237 rb_econv_t *ec;
1239 long olen;
1240 VALUE econv_wrapper;
1241 const unsigned char *start, *sp;
1242 unsigned char *dest, *dp;
1243 size_t converted_output = (size_t)ofs;
1244
1245 olen = rb_str_capacity(newstr);
1246
1247 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1248 RBASIC_CLEAR_CLASS(econv_wrapper);
1249 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1250 if (!ec) return Qnil;
1251 DATA_PTR(econv_wrapper) = ec;
1252
1253 sp = (unsigned char*)ptr;
1254 start = sp;
1255 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1256 (dp = dest + converted_output),
1257 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1259 /* destination buffer short */
1260 size_t converted_input = sp - start;
1261 size_t rest = len - converted_input;
1262 converted_output = dp - dest;
1263 rb_str_set_len(newstr, converted_output);
1264 if (converted_input && converted_output &&
1265 rest < (LONG_MAX / converted_output)) {
1266 rest = (rest * converted_output) / converted_input;
1267 }
1268 else {
1269 rest = olen;
1270 }
1271 olen += rest < 2 ? 2 : rest;
1272 rb_str_resize(newstr, olen);
1273 }
1274 DATA_PTR(econv_wrapper) = 0;
1275 RB_GC_GUARD(econv_wrapper);
1276 rb_econv_close(ec);
1277 switch (ret) {
1278 case econv_finished:
1279 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1280 rb_str_set_len(newstr, len);
1281 rb_enc_associate(newstr, to);
1282 return newstr;
1283
1284 default:
1285 return Qnil;
1286 }
1287}
1288
1289VALUE
1290rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1291{
1292 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1293}
1294
1295VALUE
1296rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1297{
1298 rb_encoding *ienc;
1299 VALUE str;
1300 const int eidx = rb_enc_to_index(eenc);
1301
1302 if (!ptr) {
1303 return rb_enc_str_new(ptr, len, eenc);
1304 }
1305
1306 /* ASCII-8BIT case, no conversion */
1307 if ((eidx == rb_ascii8bit_encindex()) ||
1308 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1309 return rb_str_new(ptr, len);
1310 }
1311 /* no default_internal or same encoding, no conversion */
1313 if (!ienc || eenc == ienc) {
1314 return rb_enc_str_new(ptr, len, eenc);
1315 }
1316 /* ASCII compatible, and ASCII only string, no conversion in
1317 * default_internal */
1318 if ((eidx == rb_ascii8bit_encindex()) ||
1319 (eidx == rb_usascii_encindex()) ||
1320 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1321 return rb_enc_str_new(ptr, len, ienc);
1322 }
1323 /* convert from the given encoding to default_internal */
1324 str = rb_enc_str_new(NULL, 0, ienc);
1325 /* when the conversion failed for some reason, just ignore the
1326 * default_internal and result in the given encoding as-is. */
1327 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1328 rb_str_initialize(str, ptr, len, eenc);
1329 }
1330 return str;
1331}
1332
1333VALUE
1334rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1335{
1336 int eidx = rb_enc_to_index(eenc);
1337 if (eidx == rb_usascii_encindex() &&
1338 !is_ascii_string(str)) {
1339 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1340 return str;
1341 }
1342 rb_enc_associate_index(str, eidx);
1343 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1344}
1345
1346VALUE
1347rb_external_str_new(const char *ptr, long len)
1348{
1350}
1351
1352VALUE
1354{
1356}
1357
1358VALUE
1359rb_locale_str_new(const char *ptr, long len)
1360{
1362}
1363
1364VALUE
1366{
1367 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1368}
1369
1370VALUE
1371rb_filesystem_str_new(const char *ptr, long len)
1372{
1374}
1375
1376VALUE
1378{
1379 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1380}
1381
1382VALUE
1387
1388VALUE
1393
1394VALUE
1395rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1396{
1397 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1398}
1399
1400static VALUE
1401str_replace_shared_without_enc(VALUE str2, VALUE str)
1402{
1403 const int termlen = TERM_LEN(str);
1404 char *ptr;
1405 long len;
1406
1407 RSTRING_GETMEM(str, ptr, len);
1408 if (str_embed_capa(str2) >= len + termlen) {
1409 char *ptr2 = RSTRING(str2)->as.embed.ary;
1410 STR_SET_EMBED(str2);
1411 memcpy(ptr2, RSTRING_PTR(str), len);
1412 TERM_FILL(ptr2+len, termlen);
1413 }
1414 else {
1415 VALUE root;
1416 if (STR_SHARED_P(str)) {
1417 root = RSTRING(str)->as.heap.aux.shared;
1418 RSTRING_GETMEM(str, ptr, len);
1419 }
1420 else {
1421 root = rb_str_new_frozen(str);
1422 RSTRING_GETMEM(root, ptr, len);
1423 }
1424 RUBY_ASSERT(OBJ_FROZEN(root));
1425
1426 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1427 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1428 rb_fatal("about to free a possible shared root");
1429 }
1430 char *ptr2 = STR_HEAP_PTR(str2);
1431 if (ptr2 != ptr) {
1432 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1433 }
1434 }
1435 FL_SET(str2, STR_NOEMBED);
1436 RSTRING(str2)->as.heap.ptr = ptr;
1437 STR_SET_SHARED(str2, root);
1438 }
1439
1440 STR_SET_LEN(str2, len);
1441
1442 return str2;
1443}
1444
1445static VALUE
1446str_replace_shared(VALUE str2, VALUE str)
1447{
1448 str_replace_shared_without_enc(str2, str);
1449 rb_enc_cr_str_exact_copy(str2, str);
1450 return str2;
1451}
1452
1453static VALUE
1454str_new_shared(VALUE klass, VALUE str)
1455{
1456 return str_replace_shared(str_alloc_heap(klass), str);
1457}
1458
1459VALUE
1461{
1462 return str_new_shared(rb_obj_class(str), str);
1463}
1464
1465VALUE
1467{
1468 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1469 return str_new_frozen(rb_obj_class(orig), orig);
1470}
1471
1472static VALUE
1473rb_str_new_frozen_String(VALUE orig)
1474{
1475 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1476 return str_new_frozen(rb_cString, orig);
1477}
1478
1479VALUE
1480rb_str_tmp_frozen_acquire(VALUE orig)
1481{
1482 if (OBJ_FROZEN_RAW(orig)) return orig;
1483 return str_new_frozen_buffer(0, orig, FALSE);
1484}
1485
1486VALUE
1487rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1488{
1489 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1490 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1491
1492 VALUE str = str_alloc_heap(0);
1493 OBJ_FREEZE(str);
1494 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1495 FL_SET(str, STR_SHARED_ROOT);
1496
1497 size_t capa = str_capacity(orig, TERM_LEN(orig));
1498
1499 /* If the string is embedded then we want to create a copy that is heap
1500 * allocated. If the string is shared then the shared root must be
1501 * embedded, so we want to create a copy. If the string is a shared root
1502 * then it must be embedded, so we want to create a copy. */
1503 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1504 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1505 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1506 }
1507 else {
1508 /* orig must be heap allocated and not shared, so we can safely transfer
1509 * the pointer to str. */
1510 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1511 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1512 RBASIC(orig)->flags &= ~STR_NOFREE;
1513 STR_SET_SHARED(orig, str);
1514 }
1515
1516 RSTRING(str)->len = RSTRING(orig)->len;
1517 RSTRING(str)->as.heap.aux.capa = capa;
1518
1519 return str;
1520}
1521
1522void
1523rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1524{
1525 if (RBASIC_CLASS(tmp) != 0)
1526 return;
1527
1528 if (STR_EMBED_P(tmp)) {
1530 }
1531 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1532 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1533 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1534
1535 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1536 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1537 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1538
1539 /* Unshare orig since the root (tmp) only has this one child. */
1540 FL_UNSET_RAW(orig, STR_SHARED);
1541 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1542 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1544
1545 /* Make tmp embedded and empty so it is safe for sweeping. */
1546 STR_SET_EMBED(tmp);
1547 STR_SET_LEN(tmp, 0);
1548 }
1549 }
1550}
1551
1552static VALUE
1553str_new_frozen(VALUE klass, VALUE orig)
1554{
1555 return str_new_frozen_buffer(klass, orig, TRUE);
1556}
1557
1558static VALUE
1559heap_str_make_shared(VALUE klass, VALUE orig)
1560{
1561 RUBY_ASSERT(!STR_EMBED_P(orig));
1562 RUBY_ASSERT(!STR_SHARED_P(orig));
1563
1564 VALUE str = str_alloc_heap(klass);
1565 STR_SET_LEN(str, RSTRING_LEN(orig));
1566 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1567 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1568 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1569 RBASIC(orig)->flags &= ~STR_NOFREE;
1570 STR_SET_SHARED(orig, str);
1571 if (klass == 0)
1572 FL_UNSET_RAW(str, STR_BORROWED);
1573 return str;
1574}
1575
1576static VALUE
1577str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1578{
1579 VALUE str;
1580
1581 long len = RSTRING_LEN(orig);
1582 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1583 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1584
1585 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1586 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1587 RUBY_ASSERT(STR_EMBED_P(str));
1588 }
1589 else {
1590 if (FL_TEST_RAW(orig, STR_SHARED)) {
1591 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1592 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1593 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1594 RUBY_ASSERT(ofs >= 0);
1595 RUBY_ASSERT(rest >= 0);
1596 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1598
1599 if ((ofs > 0) || (rest > 0) ||
1600 (klass != RBASIC(shared)->klass) ||
1601 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1602 str = str_new_shared(klass, shared);
1603 RUBY_ASSERT(!STR_EMBED_P(str));
1604 RSTRING(str)->as.heap.ptr += ofs;
1605 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1606 }
1607 else {
1608 if (RBASIC_CLASS(shared) == 0)
1609 FL_SET_RAW(shared, STR_BORROWED);
1610 return shared;
1611 }
1612 }
1613 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1614 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1615 STR_SET_EMBED(str);
1616 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1617 STR_SET_LEN(str, RSTRING_LEN(orig));
1618 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1619 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1620 }
1621 else {
1622 str = heap_str_make_shared(klass, orig);
1623 }
1624 }
1625
1626 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1627 OBJ_FREEZE(str);
1628 return str;
1629}
1630
1631VALUE
1632rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1633{
1634 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1635}
1636
1637static VALUE
1638str_new_empty_String(VALUE str)
1639{
1640 VALUE v = rb_str_new(0, 0);
1641 rb_enc_copy(v, str);
1642 return v;
1643}
1644
1645#define STR_BUF_MIN_SIZE 63
1646
1647VALUE
1649{
1650 if (STR_EMBEDDABLE_P(capa, 1)) {
1651 return str_alloc_embed(rb_cString, capa + 1);
1652 }
1653
1654 VALUE str = str_alloc_heap(rb_cString);
1655
1656 RSTRING(str)->as.heap.aux.capa = capa;
1657 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1658 RSTRING(str)->as.heap.ptr[0] = '\0';
1659
1660 return str;
1661}
1662
1663VALUE
1664rb_str_buf_new_cstr(const char *ptr)
1665{
1666 VALUE str;
1667 long len = strlen(ptr);
1668
1669 str = rb_str_buf_new(len);
1670 rb_str_buf_cat(str, ptr, len);
1671
1672 return str;
1673}
1674
1675VALUE
1677{
1678 return str_new(0, 0, len);
1679}
1680
1681void
1683{
1684 if (STR_EMBED_P(str)) {
1685 RB_DEBUG_COUNTER_INC(obj_str_embed);
1686 }
1687 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1688 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1689 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1690 }
1691 else {
1692 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1693 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1694 }
1695}
1696
1697size_t
1698rb_str_memsize(VALUE str)
1699{
1700 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1701 return STR_HEAP_SIZE(str);
1702 }
1703 else {
1704 return 0;
1705 }
1706}
1707
1708VALUE
1710{
1711 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1712}
1713
1714static inline void str_discard(VALUE str);
1715static void str_shared_replace(VALUE str, VALUE str2);
1716
1717void
1719{
1720 if (str != str2) str_shared_replace(str, str2);
1721}
1722
1723static void
1724str_shared_replace(VALUE str, VALUE str2)
1725{
1726 rb_encoding *enc;
1727 int cr;
1728 int termlen;
1729
1730 RUBY_ASSERT(str2 != str);
1731 enc = STR_ENC_GET(str2);
1732 cr = ENC_CODERANGE(str2);
1733 str_discard(str);
1734 termlen = rb_enc_mbminlen(enc);
1735
1736 STR_SET_LEN(str, RSTRING_LEN(str2));
1737
1738 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1739 STR_SET_EMBED(str);
1740 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1741 rb_enc_associate(str, enc);
1742 ENC_CODERANGE_SET(str, cr);
1743 }
1744 else {
1745 if (STR_EMBED_P(str2)) {
1746 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1747 long len = RSTRING_LEN(str2);
1748 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1749
1750 char *new_ptr = ALLOC_N(char, len + termlen);
1751 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1752 RSTRING(str2)->as.heap.ptr = new_ptr;
1753 STR_SET_LEN(str2, len);
1754 RSTRING(str2)->as.heap.aux.capa = len;
1755 STR_SET_NOEMBED(str2);
1756 }
1757
1758 STR_SET_NOEMBED(str);
1759 FL_UNSET(str, STR_SHARED);
1760 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1761
1762 if (FL_TEST(str2, STR_SHARED)) {
1763 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1764 STR_SET_SHARED(str, shared);
1765 }
1766 else {
1767 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1768 }
1769
1770 /* abandon str2 */
1771 STR_SET_EMBED(str2);
1772 RSTRING_PTR(str2)[0] = 0;
1773 STR_SET_LEN(str2, 0);
1774 rb_enc_associate(str, enc);
1775 ENC_CODERANGE_SET(str, cr);
1776 }
1777}
1778
1779VALUE
1780rb_obj_as_string(VALUE obj)
1781{
1782 VALUE str;
1783
1784 if (RB_TYPE_P(obj, T_STRING)) {
1785 return obj;
1786 }
1787 str = rb_funcall(obj, idTo_s, 0);
1788 return rb_obj_as_string_result(str, obj);
1789}
1790
1791VALUE
1792rb_obj_as_string_result(VALUE str, VALUE obj)
1793{
1794 if (!RB_TYPE_P(str, T_STRING))
1795 return rb_any_to_s(obj);
1796 return str;
1797}
1798
1799static VALUE
1800str_replace(VALUE str, VALUE str2)
1801{
1802 long len;
1803
1804 len = RSTRING_LEN(str2);
1805 if (STR_SHARED_P(str2)) {
1806 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1808 STR_SET_NOEMBED(str);
1809 STR_SET_LEN(str, len);
1810 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1811 STR_SET_SHARED(str, shared);
1812 rb_enc_cr_str_exact_copy(str, str2);
1813 }
1814 else {
1815 str_replace_shared(str, str2);
1816 }
1817
1818 return str;
1819}
1820
1821static inline VALUE
1822ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1823{
1824 size_t size = rb_str_embed_size(capa);
1825 RUBY_ASSERT(size > 0);
1826 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1827
1828 NEWOBJ_OF(str, struct RString, klass,
1830
1831 return (VALUE)str;
1832}
1833
1834static inline VALUE
1835ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1836{
1837 NEWOBJ_OF(str, struct RString, klass,
1838 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1839
1840 return (VALUE)str;
1841}
1842
1843static inline VALUE
1844str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1845{
1846 int encidx = 0;
1847 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1848 encidx = rb_enc_get_index(str);
1849 flags &= ~ENCODING_MASK;
1850 }
1851 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1852 if (encidx) rb_enc_associate_index(dup, encidx);
1853 return dup;
1854}
1855
1856static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1857
1858static inline VALUE
1859str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1860{
1861 VALUE flags = FL_TEST_RAW(str, flag_mask);
1862 long len = RSTRING_LEN(str);
1863
1864 RUBY_ASSERT(STR_EMBED_P(dup));
1865 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1866 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1867 STR_SET_LEN(dup, RSTRING_LEN(str));
1868 return str_duplicate_setup_encoding(str, dup, flags);
1869}
1870
1871static inline VALUE
1872str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1873{
1874 VALUE flags = FL_TEST_RAW(str, flag_mask);
1875 VALUE root = str;
1876 if (FL_TEST_RAW(str, STR_SHARED)) {
1877 root = RSTRING(str)->as.heap.aux.shared;
1878 }
1879 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1880 root = str = str_new_frozen(klass, str);
1881 flags = FL_TEST_RAW(str, flag_mask);
1882 }
1883 RUBY_ASSERT(!STR_SHARED_P(root));
1885
1886 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1887 FL_SET(root, STR_SHARED_ROOT);
1888 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1889 flags |= RSTRING_NOEMBED | STR_SHARED;
1890
1891 STR_SET_LEN(dup, RSTRING_LEN(str));
1892 return str_duplicate_setup_encoding(str, dup, flags);
1893}
1894
1895static inline VALUE
1896str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1897{
1898 if (STR_EMBED_P(str)) {
1899 return str_duplicate_setup_embed(klass, str, dup);
1900 }
1901 else {
1902 return str_duplicate_setup_heap(klass, str, dup);
1903 }
1904}
1905
1906static inline VALUE
1907str_duplicate(VALUE klass, VALUE str)
1908{
1909 VALUE dup;
1910 if (STR_EMBED_P(str)) {
1911 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1912 }
1913 else {
1914 dup = str_alloc_heap(klass);
1915 }
1916
1917 return str_duplicate_setup(klass, str, dup);
1918}
1919
1920VALUE
1922{
1923 return str_duplicate(rb_obj_class(str), str);
1924}
1925
1926/* :nodoc: */
1927VALUE
1928rb_str_dup_m(VALUE str)
1929{
1930 if (LIKELY(BARE_STRING_P(str))) {
1931 return str_duplicate(rb_obj_class(str), str);
1932 }
1933 else {
1934 return rb_obj_dup(str);
1935 }
1936}
1937
1938VALUE
1940{
1941 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1942 return str_duplicate(rb_cString, str);
1943}
1944
1945VALUE
1946rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1947{
1948 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1949 VALUE new_str, klass = rb_cString;
1950
1951 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1952 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1953 str_duplicate_setup_embed(klass, str, new_str);
1954 }
1955 else {
1956 new_str = ec_str_alloc_heap(ec, klass);
1957 str_duplicate_setup_heap(klass, str, new_str);
1958 }
1959 if (chilled) {
1960 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1961 }
1962 return new_str;
1963}
1964
1965VALUE
1966rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1967{
1968 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1969 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1970 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1971 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1972 return rb_str_freeze(str);
1973}
1974
1975/*
1976 *
1977 * call-seq:
1978 * String.new(string = '', **opts) -> new_string
1979 *
1980 * :include: doc/string/new.rdoc
1981 *
1982 */
1983
1984static VALUE
1985rb_str_init(int argc, VALUE *argv, VALUE str)
1986{
1987 static ID keyword_ids[2];
1988 VALUE orig, opt, venc, vcapa;
1989 VALUE kwargs[2];
1990 rb_encoding *enc = 0;
1991 int n;
1992
1993 if (!keyword_ids[0]) {
1994 keyword_ids[0] = rb_id_encoding();
1995 CONST_ID(keyword_ids[1], "capacity");
1996 }
1997
1998 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1999 if (!NIL_P(opt)) {
2000 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2001 venc = kwargs[0];
2002 vcapa = kwargs[1];
2003 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2004 enc = rb_to_encoding(venc);
2005 }
2006 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2007 long capa = NUM2LONG(vcapa);
2008 long len = 0;
2009 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2010
2011 if (capa < STR_BUF_MIN_SIZE) {
2012 capa = STR_BUF_MIN_SIZE;
2013 }
2014 if (n == 1) {
2015 StringValue(orig);
2016 len = RSTRING_LEN(orig);
2017 if (capa < len) {
2018 capa = len;
2019 }
2020 if (orig == str) n = 0;
2021 }
2022 str_modifiable(str);
2023 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2024 /* make noembed always */
2025 const size_t size = (size_t)capa + termlen;
2026 const char *const old_ptr = RSTRING_PTR(str);
2027 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2028 char *new_ptr = ALLOC_N(char, size);
2029 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2030 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2031 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2032 RSTRING(str)->as.heap.ptr = new_ptr;
2033 }
2034 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2035 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2036 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2037 }
2038 STR_SET_LEN(str, len);
2039 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2040 if (n == 1) {
2041 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2042 rb_enc_cr_str_exact_copy(str, orig);
2043 }
2044 FL_SET(str, STR_NOEMBED);
2045 RSTRING(str)->as.heap.aux.capa = capa;
2046 }
2047 else if (n == 1) {
2048 rb_str_replace(str, orig);
2049 }
2050 if (enc) {
2051 rb_enc_associate(str, enc);
2053 }
2054 }
2055 else if (n == 1) {
2056 rb_str_replace(str, orig);
2057 }
2058 return str;
2059}
2060
2061/* :nodoc: */
2062static VALUE
2063rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2064{
2065 if (klass != rb_cString) {
2066 return rb_class_new_instance_pass_kw(argc, argv, klass);
2067 }
2068
2069 static ID keyword_ids[2];
2070 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2071 VALUE kwargs[2];
2072 rb_encoding *enc = NULL;
2073
2074 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2075 if (NIL_P(opt)) {
2076 return rb_class_new_instance_pass_kw(argc, argv, klass);
2077 }
2078
2079 keyword_ids[0] = rb_id_encoding();
2080 CONST_ID(keyword_ids[1], "capacity");
2081 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2082 encoding = kwargs[0];
2083 capacity = kwargs[1];
2084
2085 if (n == 1) {
2086 orig = StringValue(orig);
2087 }
2088 else {
2089 orig = Qnil;
2090 }
2091
2092 if (UNDEF_P(encoding)) {
2093 if (!NIL_P(orig)) {
2094 encoding = rb_obj_encoding(orig);
2095 }
2096 }
2097
2098 if (!UNDEF_P(encoding)) {
2099 enc = rb_to_encoding(encoding);
2100 }
2101
2102 // If capacity is nil, we're basically just duping `orig`.
2103 if (UNDEF_P(capacity)) {
2104 if (NIL_P(orig)) {
2105 VALUE empty_str = str_new(klass, "", 0);
2106 if (enc) {
2107 rb_enc_associate(empty_str, enc);
2108 }
2109 return empty_str;
2110 }
2111 VALUE copy = str_duplicate(klass, orig);
2112 rb_enc_associate(copy, enc);
2113 ENC_CODERANGE_CLEAR(copy);
2114 return copy;
2115 }
2116
2117 long capa = 0;
2118 capa = NUM2LONG(capacity);
2119 if (capa < 0) {
2120 capa = 0;
2121 }
2122
2123 if (!NIL_P(orig)) {
2124 long orig_capa = rb_str_capacity(orig);
2125 if (orig_capa > capa) {
2126 capa = orig_capa;
2127 }
2128 }
2129
2130 VALUE str = str_enc_new(klass, NULL, capa, enc);
2131 STR_SET_LEN(str, 0);
2132 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2133
2134 if (!NIL_P(orig)) {
2135 rb_str_buf_append(str, orig);
2136 }
2137
2138 return str;
2139}
2140
#ifdef NONASCII_MASK
/* A byte is a UTF-8 lead byte unless its top two bits are 10. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 lead bytes in one machine word.
 *
 * Lead bytes look like 0xxxxxxx or 11xxxxxx (see
 * https://en.wikipedia.org/wiki/UTF-8), so for a single byte
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);    // bit6 tells whether this is a lead byte
 *
 * answers the question.  The function below applies that test to every
 * byte of the word at once and then sums the per-byte results.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Bit0 of every byte now says whether that byte is a lead byte. */
    uintptr_t leads = ((word>>6) | (~word>>7)) & (NONASCII_MASK >> 7);

    /* Add the per-byte bits together. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(leads);
#else
    leads += (leads>>8);
    leads += (leads>>16);
# if SIZEOF_VOIDP == 8
    leads += (leads>>32);
# endif
    return (leads&0xF);
#endif
}
#endif
2180
2181static inline long
2182enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2183{
2184 long c;
2185 const char *q;
2186
2187 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2188 long diff = (long)(e - p);
2189 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2190 }
2191#ifdef NONASCII_MASK
2192 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2193 uintptr_t len = 0;
2194 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2195 const uintptr_t *s, *t;
2196 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2197 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2198 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2199 while (p < (const char *)s) {
2200 if (is_utf8_lead_byte(*p)) len++;
2201 p++;
2202 }
2203 while (s < t) {
2204 len += count_utf8_lead_bytes_with_word(s);
2205 s++;
2206 }
2207 p = (const char *)s;
2208 }
2209 while (p < e) {
2210 if (is_utf8_lead_byte(*p)) len++;
2211 p++;
2212 }
2213 return (long)len;
2214 }
2215#endif
2216 else if (rb_enc_asciicompat(enc)) {
2217 c = 0;
2218 if (ENC_CODERANGE_CLEAN_P(cr)) {
2219 while (p < e) {
2220 if (ISASCII(*p)) {
2221 q = search_nonascii(p, e);
2222 if (!q)
2223 return c + (e - p);
2224 c += q - p;
2225 p = q;
2226 }
2227 p += rb_enc_fast_mbclen(p, e, enc);
2228 c++;
2229 }
2230 }
2231 else {
2232 while (p < e) {
2233 if (ISASCII(*p)) {
2234 q = search_nonascii(p, e);
2235 if (!q)
2236 return c + (e - p);
2237 c += q - p;
2238 p = q;
2239 }
2240 p += rb_enc_mbclen(p, e, enc);
2241 c++;
2242 }
2243 }
2244 return c;
2245 }
2246
2247 for (c=0; p<e; c++) {
2248 p += rb_enc_mbclen(p, e, enc);
2249 }
2250 return c;
2251}
2252
2253long
2254rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2255{
2256 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2257}
2258
2259/* To get strlen with cr
2260 * Note that given cr is not used.
2261 */
2262long
2263rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2264{
2265 long c;
2266 const char *q;
2267 int ret;
2268
2269 *cr = 0;
2270 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2271 long diff = (long)(e - p);
2272 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2273 }
2274 else if (rb_enc_asciicompat(enc)) {
2275 c = 0;
2276 while (p < e) {
2277 if (ISASCII(*p)) {
2278 q = search_nonascii(p, e);
2279 if (!q) {
2280 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2281 return c + (e - p);
2282 }
2283 c += q - p;
2284 p = q;
2285 }
2286 ret = rb_enc_precise_mbclen(p, e, enc);
2287 if (MBCLEN_CHARFOUND_P(ret)) {
2288 *cr |= ENC_CODERANGE_VALID;
2289 p += MBCLEN_CHARFOUND_LEN(ret);
2290 }
2291 else {
2293 p++;
2294 }
2295 c++;
2296 }
2297 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2298 return c;
2299 }
2300
2301 for (c=0; p<e; c++) {
2302 ret = rb_enc_precise_mbclen(p, e, enc);
2303 if (MBCLEN_CHARFOUND_P(ret)) {
2304 *cr |= ENC_CODERANGE_VALID;
2305 p += MBCLEN_CHARFOUND_LEN(ret);
2306 }
2307 else {
2309 if (p + rb_enc_mbminlen(enc) <= e)
2310 p += rb_enc_mbminlen(enc);
2311 else
2312 p = e;
2313 }
2314 }
2315 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2316 return c;
2317}
2318
2319/* enc must be str's enc or rb_enc_check(str, str2) */
2320static long
2321str_strlen(VALUE str, rb_encoding *enc)
2322{
2323 const char *p, *e;
2324 int cr;
2325
2326 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2327 if (!enc) enc = STR_ENC_GET(str);
2328 p = RSTRING_PTR(str);
2329 e = RSTRING_END(str);
2330 cr = ENC_CODERANGE(str);
2331
2332 if (cr == ENC_CODERANGE_UNKNOWN) {
2333 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2334 if (cr) ENC_CODERANGE_SET(str, cr);
2335 return n;
2336 }
2337 else {
2338 return enc_strlen(p, e, enc, cr);
2339 }
2340}
2341
2342long
2344{
2345 return str_strlen(str, NULL);
2346}
2347
2348/*
2349 * call-seq:
2350 * length -> integer
2351 *
2352 * :include: doc/string/length.rdoc
2353 *
2354 */
2355
2356VALUE
2358{
2359 return LONG2NUM(str_strlen(str, NULL));
2360}
2361
2362/*
2363 * call-seq:
2364 * bytesize -> integer
2365 *
2366 * :include: doc/string/bytesize.rdoc
2367 *
2368 */
2369
2370VALUE
2371rb_str_bytesize(VALUE str)
2372{
2373 return LONG2NUM(RSTRING_LEN(str));
2374}
2375
2376/*
2377 * call-seq:
2378 * empty? -> true or false
2379 *
2380 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2381 *
2382 * "hello".empty? # => false
2383 * " ".empty? # => false
2384 * "".empty? # => true
2385 *
2386 */
2387
2388static VALUE
2389rb_str_empty(VALUE str)
2390{
2391 return RBOOL(RSTRING_LEN(str) == 0);
2392}
2393
2394/*
2395 * call-seq:
2396 * string + other_string -> new_string
2397 *
2398 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2399 *
2400 * "Hello from " + self.to_s # => "Hello from main"
2401 *
2402 */
2403
2404VALUE
2406{
2407 VALUE str3;
2408 rb_encoding *enc;
2409 char *ptr1, *ptr2, *ptr3;
2410 long len1, len2;
2411 int termlen;
2412
2413 StringValue(str2);
2414 enc = rb_enc_check_str(str1, str2);
2415 RSTRING_GETMEM(str1, ptr1, len1);
2416 RSTRING_GETMEM(str2, ptr2, len2);
2417 termlen = rb_enc_mbminlen(enc);
2418 if (len1 > LONG_MAX - len2) {
2419 rb_raise(rb_eArgError, "string size too big");
2420 }
2421 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2422 ptr3 = RSTRING_PTR(str3);
2423 memcpy(ptr3, ptr1, len1);
2424 memcpy(ptr3+len1, ptr2, len2);
2425 TERM_FILL(&ptr3[len1+len2], termlen);
2426
2427 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2429 RB_GC_GUARD(str1);
2430 RB_GC_GUARD(str2);
2431 return str3;
2432}
2433
2434/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2435VALUE
2436rb_str_opt_plus(VALUE str1, VALUE str2)
2437{
2440 long len1, len2;
2441 MAYBE_UNUSED(char) *ptr1, *ptr2;
2442 RSTRING_GETMEM(str1, ptr1, len1);
2443 RSTRING_GETMEM(str2, ptr2, len2);
2444 int enc1 = rb_enc_get_index(str1);
2445 int enc2 = rb_enc_get_index(str2);
2446
2447 if (enc1 < 0) {
2448 return Qundef;
2449 }
2450 else if (enc2 < 0) {
2451 return Qundef;
2452 }
2453 else if (enc1 != enc2) {
2454 return Qundef;
2455 }
2456 else if (len1 > LONG_MAX - len2) {
2457 return Qundef;
2458 }
2459 else {
2460 return rb_str_plus(str1, str2);
2461 }
2462
2463}
2464
2465/*
2466 * call-seq:
2467 * string * integer -> new_string
2468 *
2469 * Returns a new +String+ containing +integer+ copies of +self+:
2470 *
2471 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2472 * "Ho! " * 0 # => ""
2473 *
2474 */
2475
2476VALUE
2478{
2479 VALUE str2;
2480 long n, len;
2481 char *ptr2;
2482 int termlen;
2483
2484 if (times == INT2FIX(1)) {
2485 return str_duplicate(rb_cString, str);
2486 }
2487 if (times == INT2FIX(0)) {
2488 str2 = str_alloc_embed(rb_cString, 0);
2489 rb_enc_copy(str2, str);
2490 return str2;
2491 }
2492 len = NUM2LONG(times);
2493 if (len < 0) {
2494 rb_raise(rb_eArgError, "negative argument");
2495 }
2496 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2497 if (STR_EMBEDDABLE_P(len, 1)) {
2498 str2 = str_alloc_embed(rb_cString, len + 1);
2499 memset(RSTRING_PTR(str2), 0, len + 1);
2500 }
2501 else {
2502 str2 = str_alloc_heap(rb_cString);
2503 RSTRING(str2)->as.heap.aux.capa = len;
2504 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2505 }
2506 STR_SET_LEN(str2, len);
2507 rb_enc_copy(str2, str);
2508 return str2;
2509 }
2510 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2511 rb_raise(rb_eArgError, "argument too big");
2512 }
2513
2514 len *= RSTRING_LEN(str);
2515 termlen = TERM_LEN(str);
2516 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2517 ptr2 = RSTRING_PTR(str2);
2518 if (len) {
2519 n = RSTRING_LEN(str);
2520 memcpy(ptr2, RSTRING_PTR(str), n);
2521 while (n <= len/2) {
2522 memcpy(ptr2 + n, ptr2, n);
2523 n *= 2;
2524 }
2525 memcpy(ptr2 + n, ptr2, len-n);
2526 }
2527 STR_SET_LEN(str2, len);
2528 TERM_FILL(&ptr2[len], termlen);
2529 rb_enc_cr_str_copy_for_substr(str2, str);
2530
2531 return str2;
2532}
2533
2534/*
2535 * call-seq:
2536 * string % object -> new_string
2537 *
2538 * Returns the result of formatting +object+ into the format specification +self+
2539 * (see Kernel#sprintf for formatting details):
2540 *
2541 * "%05d" % 123 # => "00123"
2542 *
2543 * If +self+ contains multiple substitutions, +object+ must be
2544 * an Array or Hash containing the values to be substituted:
2545 *
2546 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2547 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2548 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2549 *
2550 */
2551
2552static VALUE
2553rb_str_format_m(VALUE str, VALUE arg)
2554{
2555 VALUE tmp = rb_check_array_type(arg);
2556
2557 if (!NIL_P(tmp)) {
2558 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2559 }
2560 return rb_str_format(1, &arg, str);
2561}
2562
2563static inline void
2564rb_check_lockedtmp(VALUE str)
2565{
2566 if (FL_TEST(str, STR_TMPLOCK)) {
2567 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2568 }
2569}
2570
2571// If none of these flags are set, we know we have an modifiable string.
2572// If any is set, we need to do more detailed checks.
2573#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2574static inline void
2575str_modifiable(VALUE str)
2576{
2577 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2578 if (CHILLED_STRING_P(str)) {
2579 CHILLED_STRING_MUTATED(str);
2580 }
2581 rb_check_lockedtmp(str);
2582 rb_check_frozen(str);
2583 }
2584}
2585
2586static inline int
2587str_dependent_p(VALUE str)
2588{
2589 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2590 return FALSE;
2591 }
2592 else {
2593 return TRUE;
2594 }
2595}
2596
2597// If none of these flags are set, we know we have an independent string.
2598// If any is set, we need to do more detailed checks.
2599#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2600static inline int
2601str_independent(VALUE str)
2602{
2603 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2604 str_modifiable(str);
2605 return !str_dependent_p(str);
2606 }
2607 return TRUE;
2608}
2609
2610static void
2611str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2612{
2613 char *ptr;
2614 char *oldptr;
2615 long capa = len + expand;
2616
2617 if (len > capa) len = capa;
2618
2619 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2620 ptr = RSTRING(str)->as.heap.ptr;
2621 STR_SET_EMBED(str);
2622 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2623 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2624 STR_SET_LEN(str, len);
2625 return;
2626 }
2627
2628 ptr = ALLOC_N(char, (size_t)capa + termlen);
2629 oldptr = RSTRING_PTR(str);
2630 if (oldptr) {
2631 memcpy(ptr, oldptr, len);
2632 }
2633 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2634 xfree(oldptr);
2635 }
2636 STR_SET_NOEMBED(str);
2637 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2638 TERM_FILL(ptr + len, termlen);
2639 RSTRING(str)->as.heap.ptr = ptr;
2640 STR_SET_LEN(str, len);
2641 RSTRING(str)->as.heap.aux.capa = capa;
2642}
2643
2644void
2645rb_str_modify(VALUE str)
2646{
2647 if (!str_independent(str))
2648 str_make_independent(str);
2650}
2651
2652void
2654{
2655 int termlen = TERM_LEN(str);
2656 long len = RSTRING_LEN(str);
2657
2658 if (expand < 0) {
2659 rb_raise(rb_eArgError, "negative expanding string size");
2660 }
2661 if (expand >= LONG_MAX - len) {
2662 rb_raise(rb_eArgError, "string size too big");
2663 }
2664
2665 if (!str_independent(str)) {
2666 str_make_independent_expand(str, len, expand, termlen);
2667 }
2668 else if (expand > 0) {
2669 RESIZE_CAPA_TERM(str, len + expand, termlen);
2670 }
2672}
2673
2674/* As rb_str_modify(), but don't clear coderange */
2675static void
2676str_modify_keep_cr(VALUE str)
2677{
2678 if (!str_independent(str))
2679 str_make_independent(str);
2681 /* Force re-scan later */
2683}
2684
2685static inline void
2686str_discard(VALUE str)
2687{
2688 str_modifiable(str);
2689 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2690 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2691 RSTRING(str)->as.heap.ptr = 0;
2692 STR_SET_LEN(str, 0);
2693 }
2694}
2695
2696void
2698{
2699 int encindex = rb_enc_get_index(str);
2700
2701 if (RB_UNLIKELY(encindex == -1)) {
2702 rb_raise(rb_eTypeError, "not encoding capable object");
2703 }
2704
2705 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2706 return;
2707 }
2708
2709 rb_encoding *enc = rb_enc_from_index(encindex);
2710 if (!rb_enc_asciicompat(enc)) {
2711 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2712 }
2713}
2714
2715VALUE
2717{
2718 VALUE s = *ptr;
2719 if (!RB_TYPE_P(s, T_STRING)) {
2720 s = rb_str_to_str(s);
2721 *ptr = s;
2722 }
2723 return s;
2724}
2725
2726char *
2728{
2729 VALUE str = rb_string_value(ptr);
2730 return RSTRING_PTR(str);
2731}
2732
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != 0) {
            return 0;
        }
        s++;
        n--;
    }
    return 1;
}
2741
2742static const char *
2743str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2744{
2745 const char *e = s + len;
2746
2747 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2748 if (zero_filled(s, minlen)) return s;
2749 }
2750 return 0;
2751}
2752
2753static char *
2754str_fill_term(VALUE str, char *s, long len, int termlen)
2755{
2756 /* This function assumes that (capa + termlen) bytes of memory
2757 * is allocated, like many other functions in this file.
2758 */
2759 if (str_dependent_p(str)) {
2760 if (!zero_filled(s + len, termlen))
2761 str_make_independent_expand(str, len, 0L, termlen);
2762 }
2763 else {
2764 TERM_FILL(s + len, termlen);
2765 return s;
2766 }
2767 return RSTRING_PTR(str);
2768}
2769
2770void
2771rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2772{
2773 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2774 long len = RSTRING_LEN(str);
2775
2776 RUBY_ASSERT(capa >= len);
2777 if (capa - len < termlen) {
2778 rb_check_lockedtmp(str);
2779 str_make_independent_expand(str, len, 0L, termlen);
2780 }
2781 else if (str_dependent_p(str)) {
2782 if (termlen > oldtermlen)
2783 str_make_independent_expand(str, len, 0L, termlen);
2784 }
2785 else {
2786 if (!STR_EMBED_P(str)) {
2787 /* modify capa instead of realloc */
2788 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2789 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2790 }
2791 if (termlen > oldtermlen) {
2792 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2793 }
2794 }
2795
2796 return;
2797}
2798
2799static char *
2800str_null_check(VALUE str, int *w)
2801{
2802 char *s = RSTRING_PTR(str);
2803 long len = RSTRING_LEN(str);
2804 rb_encoding *enc = rb_enc_get(str);
2805 const int minlen = rb_enc_mbminlen(enc);
2806
2807 if (minlen > 1) {
2808 *w = 1;
2809 if (str_null_char(s, len, minlen, enc)) {
2810 return NULL;
2811 }
2812 return str_fill_term(str, s, len, minlen);
2813 }
2814 *w = 0;
2815 if (!s || memchr(s, 0, len)) {
2816 return NULL;
2817 }
2818 if (s[len]) {
2819 s = str_fill_term(str, s, len, minlen);
2820 }
2821 return s;
2822}
2823
2824char *
2825rb_str_to_cstr(VALUE str)
2826{
2827 int w;
2828 return str_null_check(str, &w);
2829}
2830
2831char *
2833{
2834 VALUE str = rb_string_value(ptr);
2835 int w;
2836 char *s = str_null_check(str, &w);
2837 if (!s) {
2838 if (w) {
2839 rb_raise(rb_eArgError, "string contains null char");
2840 }
2841 rb_raise(rb_eArgError, "string contains null byte");
2842 }
2843 return s;
2844}
2845
2846char *
2847rb_str_fill_terminator(VALUE str, const int newminlen)
2848{
2849 char *s = RSTRING_PTR(str);
2850 long len = RSTRING_LEN(str);
2851 return str_fill_term(str, s, len, newminlen);
2852}
2853
2854VALUE
2856{
2857 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2858 return str;
2859}
2860
2861/*
2862 * call-seq:
2863 * String.try_convert(object) -> object, new_string, or nil
2864 *
2865 * If +object+ is a +String+ object, returns +object+.
2866 *
2867 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2868 * calls <tt>object.to_str</tt> and returns the result.
2869 *
2870 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2871 *
2872 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2873 */
2874static VALUE
2875rb_str_s_try_convert(VALUE dummy, VALUE str)
2876{
2877 return rb_check_string_type(str);
2878}
2879
2880static char*
2881str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2882{
2883 long nth = *nthp;
2884 if (rb_enc_mbmaxlen(enc) == 1) {
2885 p += nth;
2886 }
2887 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2888 p += nth * rb_enc_mbmaxlen(enc);
2889 }
2890 else if (rb_enc_asciicompat(enc)) {
2891 const char *p2, *e2;
2892 int n;
2893
2894 while (p < e && 0 < nth) {
2895 e2 = p + nth;
2896 if (e < e2) {
2897 *nthp = nth;
2898 return (char *)e;
2899 }
2900 if (ISASCII(*p)) {
2901 p2 = search_nonascii(p, e2);
2902 if (!p2) {
2903 nth -= e2 - p;
2904 *nthp = nth;
2905 return (char *)e2;
2906 }
2907 nth -= p2 - p;
2908 p = p2;
2909 }
2910 n = rb_enc_mbclen(p, e, enc);
2911 p += n;
2912 nth--;
2913 }
2914 *nthp = nth;
2915 if (nth != 0) {
2916 return (char *)e;
2917 }
2918 return (char *)p;
2919 }
2920 else {
2921 while (p < e && nth--) {
2922 p += rb_enc_mbclen(p, e, enc);
2923 }
2924 }
2925 if (p > e) p = e;
2926 *nthp = nth;
2927 return (char*)p;
2928}
2929
2930char*
2931rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2932{
2933 return str_nth_len(p, e, &nth, enc);
2934}
2935
2936static char*
2937str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2938{
2939 if (singlebyte)
2940 p += nth;
2941 else {
2942 p = str_nth_len(p, e, &nth, enc);
2943 }
2944 if (!p) return 0;
2945 if (p > e) p = e;
2946 return (char *)p;
2947}
2948
2949/* char offset to byte offset */
2950static long
2951str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2952{
2953 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2954 if (!pp) return e - p;
2955 return pp - p;
2956}
2957
2958long
2959rb_str_offset(VALUE str, long pos)
2960{
2961 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2962 STR_ENC_GET(str), single_byte_optimizable(str));
2963}
2964
2965#ifdef NONASCII_MASK
2966static char *
2967str_utf8_nth(const char *p, const char *e, long *nthp)
2968{
2969 long nth = *nthp;
2970 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2971 const uintptr_t *s, *t;
2972 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2973 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2974 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2975 while (p < (const char *)s) {
2976 if (is_utf8_lead_byte(*p)) nth--;
2977 p++;
2978 }
2979 do {
2980 nth -= count_utf8_lead_bytes_with_word(s);
2981 s++;
2982 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2983 p = (char *)s;
2984 }
2985 while (p < e) {
2986 if (is_utf8_lead_byte(*p)) {
2987 if (nth == 0) break;
2988 nth--;
2989 }
2990 p++;
2991 }
2992 *nthp = nth;
2993 return (char *)p;
2994}
2995
/*
 * Byte distance from +p+ to the position str_utf8_nth() reaches for +nth+
 * characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *reached = str_utf8_nth(p, e, &nth);

    return reached - p;
}
3002#endif
3003
3004/* byte offset to char offset */
3005long
3006rb_str_sublen(VALUE str, long pos)
3007{
3008 if (single_byte_optimizable(str) || pos < 0)
3009 return pos;
3010 else {
3011 char *p = RSTRING_PTR(str);
3012 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3013 }
3014}
3015
3016static VALUE
3017str_subseq(VALUE str, long beg, long len)
3018{
3019 VALUE str2;
3020
3021 RUBY_ASSERT(beg >= 0);
3022 RUBY_ASSERT(len >= 0);
3023 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3024
3025 const int termlen = TERM_LEN(str);
3026 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3027 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3028 RB_GC_GUARD(str);
3029 return str2;
3030 }
3031
3032 str2 = str_alloc_heap(rb_cString);
3033 if (str_embed_capa(str2) >= len + termlen) {
3034 char *ptr2 = RSTRING(str2)->as.embed.ary;
3035 STR_SET_EMBED(str2);
3036 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3037 TERM_FILL(ptr2+len, termlen);
3038
3039 STR_SET_LEN(str2, len);
3040 RB_GC_GUARD(str);
3041 }
3042 else {
3043 str_replace_shared(str2, str);
3044 RUBY_ASSERT(!STR_EMBED_P(str2));
3045 ENC_CODERANGE_CLEAR(str2);
3046 RSTRING(str2)->as.heap.ptr += beg;
3047 if (RSTRING_LEN(str2) > len) {
3048 STR_SET_LEN(str2, len);
3049 }
3050 }
3051
3052 return str2;
3053}
3054
3055VALUE
3056rb_str_subseq(VALUE str, long beg, long len)
3057{
3058 VALUE str2 = str_subseq(str, beg, len);
3059 rb_enc_cr_str_copy_for_substr(str2, str);
3060 return str2;
3061}
3062
3063char *
3064rb_str_subpos(VALUE str, long beg, long *lenp)
3065{
3066 long len = *lenp;
3067 long slen = -1L;
3068 const long blen = RSTRING_LEN(str);
3069 rb_encoding *enc = STR_ENC_GET(str);
3070 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3071
3072 if (len < 0) return 0;
3073 if (beg < 0 && -beg < 0) return 0;
3074 if (!blen) {
3075 len = 0;
3076 }
3077 if (single_byte_optimizable(str)) {
3078 if (beg > blen) return 0;
3079 if (beg < 0) {
3080 beg += blen;
3081 if (beg < 0) return 0;
3082 }
3083 if (len > blen - beg)
3084 len = blen - beg;
3085 if (len < 0) return 0;
3086 p = s + beg;
3087 goto end;
3088 }
3089 if (beg < 0) {
3090 if (len > -beg) len = -beg;
3091 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3092 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3093 beg = -beg;
3094 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3095 p = e;
3096 if (!p) return 0;
3097 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3098 if (!p) return 0;
3099 len = e - p;
3100 goto end;
3101 }
3102 else {
3103 slen = str_strlen(str, enc);
3104 beg += slen;
3105 if (beg < 0) return 0;
3106 p = s + beg;
3107 if (len == 0) goto end;
3108 }
3109 }
3110 else if (beg > 0 && beg > blen) {
3111 return 0;
3112 }
3113 if (len == 0) {
3114 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3115 p = s + beg;
3116 }
3117#ifdef NONASCII_MASK
3118 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3119 enc == rb_utf8_encoding()) {
3120 p = str_utf8_nth(s, e, &beg);
3121 if (beg > 0) return 0;
3122 len = str_utf8_offset(p, e, len);
3123 }
3124#endif
3125 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3126 int char_sz = rb_enc_mbmaxlen(enc);
3127
3128 p = s + beg * char_sz;
3129 if (p > e) {
3130 return 0;
3131 }
3132 else if (len * char_sz > e - p)
3133 len = e - p;
3134 else
3135 len *= char_sz;
3136 }
3137 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3138 if (beg > 0) return 0;
3139 len = 0;
3140 }
3141 else {
3142 len = str_offset(p, e, len, enc, 0);
3143 }
3144 end:
3145 *lenp = len;
3146 RB_GC_GUARD(str);
3147 return p;
3148}
3149
3150static VALUE str_substr(VALUE str, long beg, long len, int empty);
3151
3152VALUE
3153rb_str_substr(VALUE str, long beg, long len)
3154{
3155 return str_substr(str, beg, len, TRUE);
3156}
3157
3158VALUE
3159rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3160{
3161 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3162}
3163
3164static VALUE
3165str_substr(VALUE str, long beg, long len, int empty)
3166{
3167 char *p = rb_str_subpos(str, beg, &len);
3168
3169 if (!p) return Qnil;
3170 if (!len && !empty) return Qnil;
3171
3172 beg = p - RSTRING_PTR(str);
3173
3174 VALUE str2 = str_subseq(str, beg, len);
3175 rb_enc_cr_str_copy_for_substr(str2, str);
3176 return str2;
3177}
3178
3179/* :nodoc: */
3180VALUE
3182{
3183 if (CHILLED_STRING_P(str)) {
3184 FL_UNSET_RAW(str, STR_CHILLED);
3185 }
3186
3187 if (OBJ_FROZEN(str)) return str;
3188 rb_str_resize(str, RSTRING_LEN(str));
3189 return rb_obj_freeze(str);
3190}
3191
3192/*
3193 * call-seq:
3194 * +string -> new_string or self
3195 *
3196 * Returns +self+ if +self+ is not frozen and can be mutated
3197 * without warning issuance.
3198 *
3199 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3200 */
3201static VALUE
3202str_uplus(VALUE str)
3203{
3204 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3205 return rb_str_dup(str);
3206 }
3207 else {
3208 return str;
3209 }
3210}
3211
3212/*
3213 * call-seq:
3214 * -string -> frozen_string
3215 * dedup -> frozen_string
3216 *
3217 * Returns a frozen, possibly pre-existing copy of the string.
3218 *
3219 * The returned +String+ will be deduplicated as long as it does not have
3220 * any instance variables set on it and is not a String subclass.
3221 *
3222 * Note that <tt>-string</tt> variant is more convenient for defining
3223 * constants:
3224 *
3225 * FILENAME = -'config/database.yml'
3226 *
3227 * while +dedup+ is better suitable for using the method in chains
3228 * of calculations:
3229 *
3230 * @url_list.concat(urls.map(&:dedup))
3231 *
3232 */
3233static VALUE
3234str_uminus(VALUE str)
3235{
3236 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3237 str = rb_str_dup(str);
3238 }
3239 return rb_fstring(str);
3240}
3241
3242RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3243#define rb_str_dup_frozen rb_str_new_frozen
3244
3245VALUE
3247{
3248 if (FL_TEST(str, STR_TMPLOCK)) {
3249 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3250 }
3251 FL_SET(str, STR_TMPLOCK);
3252 return str;
3253}
3254
3255VALUE
3257{
3258 if (!FL_TEST(str, STR_TMPLOCK)) {
3259 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3260 }
3261 FL_UNSET(str, STR_TMPLOCK);
3262 return str;
3263}
3264
3265VALUE
3266rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3267{
3268 rb_str_locktmp(str);
3269 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3270}
3271
3272void
3274{
3275 long capa;
3276 const int termlen = TERM_LEN(str);
3277
3278 str_modifiable(str);
3279 if (STR_SHARED_P(str)) {
3280 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3281 }
3282 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3283 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3284 }
3285
3286 int cr = ENC_CODERANGE(str);
3287 if (len == 0) {
3288 /* Empty string does not contain non-ASCII */
3290 }
3291 else if (cr == ENC_CODERANGE_UNKNOWN) {
3292 /* Leave unknown. */
3293 }
3294 else if (len > RSTRING_LEN(str)) {
3295 if (ENC_CODERANGE_CLEAN_P(cr)) {
3296 /* Update the coderange regarding the extended part. */
3297 const char *const prev_end = RSTRING_END(str);
3298 const char *const new_end = RSTRING_PTR(str) + len;
3299 rb_encoding *enc = rb_enc_get(str);
3300 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3301 ENC_CODERANGE_SET(str, cr);
3302 }
3303 else if (cr == ENC_CODERANGE_BROKEN) {
3304 /* May be valid now, by appended part. */
3306 }
3307 }
3308 else if (len < RSTRING_LEN(str)) {
3309 if (cr != ENC_CODERANGE_7BIT) {
3310 /* ASCII-only string is keeping after truncated. Valid
3311 * and broken may be invalid or valid, leave unknown. */
3313 }
3314 }
3315
3316 STR_SET_LEN(str, len);
3317 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3318}
3319
3320VALUE
3321rb_str_resize(VALUE str, long len)
3322{
3323 if (len < 0) {
3324 rb_raise(rb_eArgError, "negative string size (or size too big)");
3325 }
3326
3327 int independent = str_independent(str);
3328 long slen = RSTRING_LEN(str);
3329 const int termlen = TERM_LEN(str);
3330
3331 if (slen > len || (termlen != 1 && slen < len)) {
3333 }
3334
3335 {
3336 long capa;
3337 if (STR_EMBED_P(str)) {
3338 if (len == slen) return str;
3339 if (str_embed_capa(str) >= len + termlen) {
3340 STR_SET_LEN(str, len);
3341 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3342 return str;
3343 }
3344 str_make_independent_expand(str, slen, len - slen, termlen);
3345 }
3346 else if (str_embed_capa(str) >= len + termlen) {
3347 char *ptr = STR_HEAP_PTR(str);
3348 STR_SET_EMBED(str);
3349 if (slen > len) slen = len;
3350 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3351 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3352 STR_SET_LEN(str, len);
3353 if (independent) ruby_xfree(ptr);
3354 return str;
3355 }
3356 else if (!independent) {
3357 if (len == slen) return str;
3358 str_make_independent_expand(str, slen, len - slen, termlen);
3359 }
3360 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3361 (capa - len) > (len < 1024 ? len : 1024)) {
3362 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3363 (size_t)len + termlen, STR_HEAP_SIZE(str));
3364 RSTRING(str)->as.heap.aux.capa = len;
3365 }
3366 else if (len == slen) return str;
3367 STR_SET_LEN(str, len);
3368 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3369 }
3370 return str;
3371}
3372
3373static void
3374str_ensure_available_capa(VALUE str, long len)
3375{
3376 str_modify_keep_cr(str);
3377
3378 const int termlen = TERM_LEN(str);
3379 long olen = RSTRING_LEN(str);
3380
3381 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3382 rb_raise(rb_eArgError, "string sizes too big");
3383 }
3384
3385 long total = olen + len;
3386 long capa = str_capacity(str, termlen);
3387
3388 if (capa < total) {
3389 if (total >= LONG_MAX / 2) {
3390 capa = total;
3391 }
3392 while (total > capa) {
3393 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3394 }
3395 RESIZE_CAPA_TERM(str, capa, termlen);
3396 }
3397}
3398
3399static VALUE
3400str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3401{
3402 if (keep_cr) {
3403 str_modify_keep_cr(str);
3404 }
3405 else {
3406 rb_str_modify(str);
3407 }
3408 if (len == 0) return 0;
3409
3410 long total, olen, off = -1;
3411 char *sptr;
3412 const int termlen = TERM_LEN(str);
3413
3414 RSTRING_GETMEM(str, sptr, olen);
3415 if (ptr >= sptr && ptr <= sptr + olen) {
3416 off = ptr - sptr;
3417 }
3418
3419 long capa = str_capacity(str, termlen);
3420
3421 if (olen > LONG_MAX - len) {
3422 rb_raise(rb_eArgError, "string sizes too big");
3423 }
3424 total = olen + len;
3425 if (capa < total) {
3426 if (total >= LONG_MAX / 2) {
3427 capa = total;
3428 }
3429 while (total > capa) {
3430 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3431 }
3432 RESIZE_CAPA_TERM(str, capa, termlen);
3433 sptr = RSTRING_PTR(str);
3434 }
3435 if (off != -1) {
3436 ptr = sptr + off;
3437 }
3438 memcpy(sptr + olen, ptr, len);
3439 STR_SET_LEN(str, total);
3440 TERM_FILL(sptr + total, termlen); /* sentinel */
3441
3442 return str;
3443}
3444
3445#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3446#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3447
3448VALUE
3449rb_str_cat(VALUE str, const char *ptr, long len)
3450{
3451 if (len == 0) return str;
3452 if (len < 0) {
3453 rb_raise(rb_eArgError, "negative string size (or size too big)");
3454 }
3455 return str_buf_cat(str, ptr, len);
3456}
3457
3458VALUE
3459rb_str_cat_cstr(VALUE str, const char *ptr)
3460{
3461 must_not_null(ptr);
3462 return rb_str_buf_cat(str, ptr, strlen(ptr));
3463}
3464
3465static void
3466rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3467{
3468 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3469
3470 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3471 if (UNLIKELY(!str_independent(str))) {
3472 str_make_independent(str);
3473 }
3474
3475 long string_length = -1;
3476 const int null_terminator_length = 1;
3477 char *sptr;
3478 RSTRING_GETMEM(str, sptr, string_length);
3479
3480 // Ensure the resulting string wouldn't be too long.
3481 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3482 rb_raise(rb_eArgError, "string sizes too big");
3483 }
3484
3485 long string_capacity = str_capacity(str, null_terminator_length);
3486
3487 // Get the code range before any modifications since those might clear the code range.
3488 int cr = ENC_CODERANGE(str);
3489
3490 // Check if the string has spare string_capacity to write the new byte.
3491 if (LIKELY(string_capacity >= string_length + 1)) {
3492 // In fast path we can write the new byte and note the string's new length.
3493 sptr[string_length] = byte;
3494 STR_SET_LEN(str, string_length + 1);
3495 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3496 }
3497 else {
3498 // If there's not enough string_capacity, make a call into the general string concatenation function.
3499 str_buf_cat(str, (char *)&byte, 1);
3500 }
3501
3502 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3503 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3504 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3505 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3506 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3507 if (ISASCII(byte)) {
3509 }
3510 else {
3512
3513 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3514 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3515 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3516 }
3517 }
3518 }
3519}
3520
3521RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3522RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3523RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3524
3525static VALUE
3526rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3527 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3528{
3529 int str_encindex = ENCODING_GET(str);
3530 int res_encindex;
3531 int str_cr, res_cr;
3532 rb_encoding *str_enc, *ptr_enc;
3533
3534 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3535
3536 if (str_encindex == ptr_encindex) {
3537 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3538 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3539 }
3540 }
3541 else {
3542 str_enc = rb_enc_from_index(str_encindex);
3543 ptr_enc = rb_enc_from_index(ptr_encindex);
3544 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3545 if (len == 0)
3546 return str;
3547 if (RSTRING_LEN(str) == 0) {
3548 rb_str_buf_cat(str, ptr, len);
3549 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3550 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3551 return str;
3552 }
3553 goto incompatible;
3554 }
3555 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3556 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3557 }
3558 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3559 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3560 str_cr = rb_enc_str_coderange(str);
3561 }
3562 }
3563 }
3564 if (ptr_cr_ret)
3565 *ptr_cr_ret = ptr_cr;
3566
3567 if (str_encindex != ptr_encindex &&
3568 str_cr != ENC_CODERANGE_7BIT &&
3569 ptr_cr != ENC_CODERANGE_7BIT) {
3570 str_enc = rb_enc_from_index(str_encindex);
3571 ptr_enc = rb_enc_from_index(ptr_encindex);
3572 goto incompatible;
3573 }
3574
3575 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3576 res_encindex = str_encindex;
3577 res_cr = ENC_CODERANGE_UNKNOWN;
3578 }
3579 else if (str_cr == ENC_CODERANGE_7BIT) {
3580 if (ptr_cr == ENC_CODERANGE_7BIT) {
3581 res_encindex = str_encindex;
3582 res_cr = ENC_CODERANGE_7BIT;
3583 }
3584 else {
3585 res_encindex = ptr_encindex;
3586 res_cr = ptr_cr;
3587 }
3588 }
3589 else if (str_cr == ENC_CODERANGE_VALID) {
3590 res_encindex = str_encindex;
3591 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3592 res_cr = str_cr;
3593 else
3594 res_cr = ptr_cr;
3595 }
3596 else { /* str_cr == ENC_CODERANGE_BROKEN */
3597 res_encindex = str_encindex;
3598 res_cr = str_cr;
3599 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3600 }
3601
3602 if (len < 0) {
3603 rb_raise(rb_eArgError, "negative string size (or size too big)");
3604 }
3605 str_buf_cat(str, ptr, len);
3606 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3607 return str;
3608
3609 incompatible:
3610 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3611 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3613}
3614
3615VALUE
3616rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3617{
3618 return rb_enc_cr_str_buf_cat(str, ptr, len,
3619 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3620}
3621
3622VALUE
3623rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3624{
3625 /* ptr must reference NUL terminated ASCII string. */
3626 int encindex = ENCODING_GET(str);
3627 rb_encoding *enc = rb_enc_from_index(encindex);
3628 if (rb_enc_asciicompat(enc)) {
3629 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3630 encindex, ENC_CODERANGE_7BIT, 0);
3631 }
3632 else {
3633 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3634 while (*ptr) {
3635 unsigned int c = (unsigned char)*ptr;
3636 int len = rb_enc_codelen(c, enc);
3637 rb_enc_mbcput(c, buf, enc);
3638 rb_enc_cr_str_buf_cat(str, buf, len,
3639 encindex, ENC_CODERANGE_VALID, 0);
3640 ptr++;
3641 }
3642 return str;
3643 }
3644}
3645
3646VALUE
3648{
3649 int str2_cr = rb_enc_str_coderange(str2);
3650
3651 if (str_enc_fastpath(str)) {
3652 switch (str2_cr) {
3653 case ENC_CODERANGE_7BIT:
3654 // If RHS is 7bit we can do simple concatenation
3655 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3656 RB_GC_GUARD(str2);
3657 return str;
3659 // If RHS is valid, we can do simple concatenation if encodings are the same
3660 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3661 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3662 int str_cr = ENC_CODERANGE(str);
3663 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3664 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3665 }
3666 RB_GC_GUARD(str2);
3667 return str;
3668 }
3669 }
3670 }
3671
3672 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3673 ENCODING_GET(str2), str2_cr, &str2_cr);
3674
3675 ENC_CODERANGE_SET(str2, str2_cr);
3676
3677 return str;
3678}
3679
3680VALUE
3682{
3683 StringValue(str2);
3684 return rb_str_buf_append(str, str2);
3685}
3686
3687VALUE
3688rb_str_concat_literals(size_t num, const VALUE *strary)
3689{
3690 VALUE str;
3691 size_t i, s = 0;
3692 unsigned long len = 1;
3693
3694 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3695 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3696
3697 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3698 str = rb_str_buf_new(len);
3699 str_enc_copy_direct(str, strary[0]);
3700
3701 for (i = s; i < num; ++i) {
3702 const VALUE v = strary[i];
3703 int encidx = ENCODING_GET(v);
3704
3705 rb_str_buf_append(str, v);
3706 if (encidx != ENCINDEX_US_ASCII) {
3707 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3708 rb_enc_set_index(str, encidx);
3709 }
3710 }
3711 return str;
3712}
3713
3714/*
3715 * call-seq:
3716 * concat(*objects) -> string
3717 *
3718 * Concatenates each object in +objects+ to +self+ and returns +self+:
3719 *
3720 * s = 'foo'
3721 * s.concat('bar', 'baz') # => "foobarbaz"
3722 * s # => "foobarbaz"
3723 *
3724 * For each given object +object+ that is an Integer,
3725 * the value is considered a codepoint and converted to a character before concatenation:
3726 *
3727 * s = 'foo'
3728 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3729 *
3730 * Related: String#<<, which takes a single argument.
3731 */
3732static VALUE
3733rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3734{
3735 str_modifiable(str);
3736
3737 if (argc == 1) {
3738 return rb_str_concat(str, argv[0]);
3739 }
3740 else if (argc > 1) {
3741 int i;
3742 VALUE arg_str = rb_str_tmp_new(0);
3743 rb_enc_copy(arg_str, str);
3744 for (i = 0; i < argc; i++) {
3745 rb_str_concat(arg_str, argv[i]);
3746 }
3747 rb_str_buf_append(str, arg_str);
3748 }
3749
3750 return str;
3751}
3752
3753/*
3754 * call-seq:
3755 * append_as_bytes(*objects) -> string
3756 *
3757 * Concatenates each object in +objects+ into +self+ without any encoding
3758 * validation or conversion and returns +self+:
3759 *
3760 * s = 'foo'
3761 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3762 * s.valid_encoding? # => false
3763 * s.append_as_bytes("\xAC 12")
3764 * s.valid_encoding? # => true
3765 *
3766 * For each given object +object+ that is an Integer,
3767 * the value is considered a Byte. If the Integer is bigger
3768 * than one byte, only the lower byte is considered, similar to String#setbyte:
3769 *
3770 * s = ""
3771 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3772 *
3773 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3774 */
3775
3776VALUE
3777rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3778{
3779 long needed_capacity = 0;
3780 volatile VALUE t0;
3781 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3782
3783 for (int index = 0; index < argc; index++) {
3784 VALUE obj = argv[index];
3785 enum ruby_value_type type = types[index] = rb_type(obj);
3786 switch (type) {
3787 case T_FIXNUM:
3788 case T_BIGNUM:
3789 needed_capacity++;
3790 break;
3791 case T_STRING:
3792 needed_capacity += RSTRING_LEN(obj);
3793 break;
3794 default:
3795 rb_raise(
3797 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3798 rb_obj_class(obj)
3799 );
3800 break;
3801 }
3802 }
3803
3804 str_ensure_available_capa(str, needed_capacity);
3805 char *sptr = RSTRING_END(str);
3806
3807 for (int index = 0; index < argc; index++) {
3808 VALUE obj = argv[index];
3809 enum ruby_value_type type = types[index];
3810 switch (type) {
3811 case T_FIXNUM:
3812 case T_BIGNUM: {
3813 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3814 char byte = (char)(NUM2INT(obj) & 0xFF);
3815 *sptr = byte;
3816 sptr++;
3817 break;
3818 }
3819 case T_STRING: {
3820 const char *ptr;
3821 long len;
3822 RSTRING_GETMEM(obj, ptr, len);
3823 memcpy(sptr, ptr, len);
3824 sptr += len;
3825 break;
3826 }
3827 default:
3828 rb_bug("append_as_bytes arguments should have been validated");
3829 }
3830 }
3831
3832 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3833 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3834
3835 int cr = ENC_CODERANGE(str);
3836 switch (cr) {
3837 case ENC_CODERANGE_7BIT: {
3838 for (int index = 0; index < argc; index++) {
3839 VALUE obj = argv[index];
3840 enum ruby_value_type type = types[index];
3841 switch (type) {
3842 case T_FIXNUM:
3843 case T_BIGNUM: {
3844 if (!ISASCII(NUM2INT(obj))) {
3845 goto clear_cr;
3846 }
3847 break;
3848 }
3849 case T_STRING: {
3850 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3851 goto clear_cr;
3852 }
3853 break;
3854 }
3855 default:
3856 rb_bug("append_as_bytes arguments should have been validated");
3857 }
3858 }
3859 break;
3860 }
3862 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3863 goto keep_cr;
3864 }
3865 else {
3866 goto clear_cr;
3867 }
3868 break;
3869 default:
3870 goto clear_cr;
3871 break;
3872 }
3873
3874 RB_GC_GUARD(t0);
3875
3876 clear_cr:
3877 // If no fast path was hit, we clear the coderange.
3878 // append_as_bytes is predominently meant to be used in
3879 // buffering situation, hence it's likely the coderange
3880 // will never be scanned, so it's not worth spending time
3881 // precomputing the coderange except for simple and common
3882 // situations.
3884 keep_cr:
3885 return str;
3886}
3887
3888/*
3889 * call-seq:
3890 * string << object -> string
3891 *
3892 * Concatenates +object+ to +self+ and returns +self+:
3893 *
3894 * s = 'foo'
3895 * s << 'bar' # => "foobar"
3896 * s # => "foobar"
3897 *
3898 * If +object+ is an Integer,
3899 * the value is considered a codepoint and converted to a character before concatenation:
3900 *
3901 * s = 'foo'
3902 * s << 33 # => "foo!"
3903 *
3904 * If that codepoint is not representable in the encoding of
3905 * _string_, RangeError is raised.
3906 *
3907 * s = 'foo'
3908 * s.encoding # => <Encoding:UTF-8>
3909 * s << 0x00110000 # 1114112 out of char range (RangeError)
3910 * s = 'foo'.encode('EUC-JP')
3911 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3912 *
3913 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3914 * is automatically promoted to ASCII-8BIT.
3915 *
3916 * s = 'foo'.encode('US-ASCII')
3917 * s << 0xff
3918 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3919 *
3920 * Related: String#concat, which takes multiple arguments.
3921 */
3922VALUE
3924{
3925 unsigned int code;
3926 rb_encoding *enc = STR_ENC_GET(str1);
3927 int encidx;
3928
3929 if (RB_INTEGER_TYPE_P(str2)) {
3930 if (rb_num_to_uint(str2, &code) == 0) {
3931 }
3932 else if (FIXNUM_P(str2)) {
3933 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3934 }
3935 else {
3936 rb_raise(rb_eRangeError, "bignum out of char range");
3937 }
3938 }
3939 else {
3940 return rb_str_append(str1, str2);
3941 }
3942
3943 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3944
3945 if (encidx >= 0) {
3946 rb_str_buf_cat_byte(str1, (unsigned char)code);
3947 }
3948 else {
3949 long pos = RSTRING_LEN(str1);
3950 int cr = ENC_CODERANGE(str1);
3951 int len;
3952 char *buf;
3953
3954 switch (len = rb_enc_codelen(code, enc)) {
3955 case ONIGERR_INVALID_CODE_POINT_VALUE:
3956 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3957 break;
3958 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3959 case 0:
3960 rb_raise(rb_eRangeError, "%u out of char range", code);
3961 break;
3962 }
3963 buf = ALLOCA_N(char, len + 1);
3964 rb_enc_mbcput(code, buf, enc);
3965 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3966 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3967 }
3968 rb_str_resize(str1, pos+len);
3969 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3970 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3972 }
3973 else if (cr == ENC_CODERANGE_BROKEN) {
3975 }
3976 ENC_CODERANGE_SET(str1, cr);
3977 }
3978 return str1;
3979}
3980
3981int
3982rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3983{
3984 int encidx = rb_enc_to_index(enc);
3985
3986 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3987 /* US-ASCII automatically extended to ASCII-8BIT */
3988 if (code > 0xFF) {
3989 rb_raise(rb_eRangeError, "%u out of char range", code);
3990 }
3991 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3992 return ENCINDEX_ASCII_8BIT;
3993 }
3994 return encidx;
3995 }
3996 else {
3997 return -1;
3998 }
3999}
4000
4001/*
4002 * call-seq:
4003 * prepend(*other_strings) -> string
4004 *
4005 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4006 *
4007 * s = 'foo'
4008 * s.prepend('bar', 'baz') # => "barbazfoo"
4009 * s # => "barbazfoo"
4010 *
4011 * Related: String#concat.
4012 */
4013
4014static VALUE
4015rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4016{
4017 str_modifiable(str);
4018
4019 if (argc == 1) {
4020 rb_str_update(str, 0L, 0L, argv[0]);
4021 }
4022 else if (argc > 1) {
4023 int i;
4024 VALUE arg_str = rb_str_tmp_new(0);
4025 rb_enc_copy(arg_str, str);
4026 for (i = 0; i < argc; i++) {
4027 rb_str_append(arg_str, argv[i]);
4028 }
4029 rb_str_update(str, 0L, 0L, arg_str);
4030 }
4031
4032 return str;
4033}
4034
4035st_index_t
4037{
4038 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4039 st_index_t precomputed_hash;
4040 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4041
4042 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4043 return precomputed_hash;
4044 }
4045
4046 return str_do_hash(str);
4047}
4048
4049int
4051{
4052 long len1, len2;
4053 const char *ptr1, *ptr2;
4054 RSTRING_GETMEM(str1, ptr1, len1);
4055 RSTRING_GETMEM(str2, ptr2, len2);
4056 return (len1 != len2 ||
4057 !rb_str_comparable(str1, str2) ||
4058 memcmp(ptr1, ptr2, len1) != 0);
4059}
4060
4061/*
4062 * call-seq:
4063 * hash -> integer
4064 *
4065 * Returns the integer hash value for +self+.
4066 * The value is based on the length, content and encoding of +self+.
4067 *
4068 * Related: Object#hash.
4069 */
4070
4071static VALUE
4072rb_str_hash_m(VALUE str)
4073{
4074 st_index_t hval = rb_str_hash(str);
4075 return ST2FIX(hval);
4076}
4077
/* Smaller of two values; each argument may be evaluated more than once. */
#define lesser(a,b) (((b)<(a))?(b):(a))
4079
4080int
4082{
4083 int idx1, idx2;
4084 int rc1, rc2;
4085
4086 if (RSTRING_LEN(str1) == 0) return TRUE;
4087 if (RSTRING_LEN(str2) == 0) return TRUE;
4088 idx1 = ENCODING_GET(str1);
4089 idx2 = ENCODING_GET(str2);
4090 if (idx1 == idx2) return TRUE;
4091 rc1 = rb_enc_str_coderange(str1);
4092 rc2 = rb_enc_str_coderange(str2);
4093 if (rc1 == ENC_CODERANGE_7BIT) {
4094 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4095 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4096 return TRUE;
4097 }
4098 if (rc2 == ENC_CODERANGE_7BIT) {
4099 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4100 return TRUE;
4101 }
4102 return FALSE;
4103}
4104
4105int
4107{
4108 long len1, len2;
4109 const char *ptr1, *ptr2;
4110 int retval;
4111
4112 if (str1 == str2) return 0;
4113 RSTRING_GETMEM(str1, ptr1, len1);
4114 RSTRING_GETMEM(str2, ptr2, len2);
4115 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4116 if (len1 == len2) {
4117 if (!rb_str_comparable(str1, str2)) {
4118 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4119 return 1;
4120 return -1;
4121 }
4122 return 0;
4123 }
4124 if (len1 > len2) return 1;
4125 return -1;
4126 }
4127 if (retval > 0) return 1;
4128 return -1;
4129}
4130
4131/*
4132 * call-seq:
4133 * string == object -> true or false
4134 * string === object -> true or false
4135 *
 * Returns +true+ if +object+ has the same length and content
 * as +self+; +false+ otherwise:
4138 *
4139 * s = 'foo'
4140 * s == 'foo' # => true
4141 * s == 'food' # => false
4142 * s == 'FOO' # => false
4143 *
4144 * Returns +false+ if the two strings' encodings are not compatible:
4145 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4146 *
4147 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4148 * two strings are compared using <code>object.==</code>.
4149 */
4150
4151VALUE
4153{
4154 if (str1 == str2) return Qtrue;
4155 if (!RB_TYPE_P(str2, T_STRING)) {
4156 if (!rb_respond_to(str2, idTo_str)) {
4157 return Qfalse;
4158 }
4159 return rb_equal(str2, str1);
4160 }
4161 return rb_str_eql_internal(str1, str2);
4162}
4163
4164/*
4165 * call-seq:
4166 * eql?(object) -> true or false
4167 *
 * Returns +true+ if +object+ has the same length and content
 * as +self+; +false+ otherwise:
4170 *
4171 * s = 'foo'
4172 * s.eql?('foo') # => true
4173 * s.eql?('food') # => false
4174 * s.eql?('FOO') # => false
4175 *
4176 * Returns +false+ if the two strings' encodings are not compatible:
4177 *
4178 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4179 *
4180 */
4181
4182VALUE
4183rb_str_eql(VALUE str1, VALUE str2)
4184{
4185 if (str1 == str2) return Qtrue;
4186 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4187 return rb_str_eql_internal(str1, str2);
4188}
4189
4190/*
4191 * call-seq:
4192 * string <=> other_string -> -1, 0, 1, or nil
4193 *
4194 * Compares +self+ and +other_string+, returning:
4195 *
4196 * - -1 if +other_string+ is larger.
4197 * - 0 if the two are equal.
4198 * - 1 if +other_string+ is smaller.
4199 * - +nil+ if the two are incomparable.
4200 *
4201 * Examples:
4202 *
4203 * 'foo' <=> 'foo' # => 0
4204 * 'foo' <=> 'food' # => -1
4205 * 'food' <=> 'foo' # => 1
4206 * 'FOO' <=> 'foo' # => -1
4207 * 'foo' <=> 'FOO' # => 1
4208 * 'foo' <=> 1 # => nil
4209 *
4210 */
4211
4212static VALUE
4213rb_str_cmp_m(VALUE str1, VALUE str2)
4214{
4215 int result;
4216 VALUE s = rb_check_string_type(str2);
4217 if (NIL_P(s)) {
4218 return rb_invcmp(str1, str2);
4219 }
4220 result = rb_str_cmp(str1, s);
4221 return INT2FIX(result);
4222}
4223
4224static VALUE str_casecmp(VALUE str1, VALUE str2);
4225static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4226
4227/*
4228 * call-seq:
4229 * casecmp(other_string) -> -1, 0, 1, or nil
4230 *
4231 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4232 *
4233 * - -1 if <tt>other_string.downcase</tt> is larger.
4234 * - 0 if the two are equal.
4235 * - 1 if <tt>other_string.downcase</tt> is smaller.
4236 * - +nil+ if the two are incomparable.
4237 *
4238 * Examples:
4239 *
4240 * 'foo'.casecmp('foo') # => 0
4241 * 'foo'.casecmp('food') # => -1
4242 * 'food'.casecmp('foo') # => 1
4243 * 'FOO'.casecmp('foo') # => 0
4244 * 'foo'.casecmp('FOO') # => 0
4245 * 'foo'.casecmp(1) # => nil
4246 *
4247 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4248 *
4249 * Related: String#casecmp?.
4250 *
4251 */
4252
4253static VALUE
4254rb_str_casecmp(VALUE str1, VALUE str2)
4255{
4256 VALUE s = rb_check_string_type(str2);
4257 if (NIL_P(s)) {
4258 return Qnil;
4259 }
4260 return str_casecmp(str1, s);
4261}
4262
4263static VALUE
4264str_casecmp(VALUE str1, VALUE str2)
4265{
4266 long len;
4267 rb_encoding *enc;
4268 const char *p1, *p1end, *p2, *p2end;
4269
4270 enc = rb_enc_compatible(str1, str2);
4271 if (!enc) {
4272 return Qnil;
4273 }
4274
4275 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4276 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4277 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4278 while (p1 < p1end && p2 < p2end) {
4279 if (*p1 != *p2) {
4280 unsigned int c1 = TOLOWER(*p1 & 0xff);
4281 unsigned int c2 = TOLOWER(*p2 & 0xff);
4282 if (c1 != c2)
4283 return INT2FIX(c1 < c2 ? -1 : 1);
4284 }
4285 p1++;
4286 p2++;
4287 }
4288 }
4289 else {
4290 while (p1 < p1end && p2 < p2end) {
4291 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4292 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4293
4294 if (0 <= c1 && 0 <= c2) {
4295 c1 = TOLOWER(c1);
4296 c2 = TOLOWER(c2);
4297 if (c1 != c2)
4298 return INT2FIX(c1 < c2 ? -1 : 1);
4299 }
4300 else {
4301 int r;
4302 l1 = rb_enc_mbclen(p1, p1end, enc);
4303 l2 = rb_enc_mbclen(p2, p2end, enc);
4304 len = l1 < l2 ? l1 : l2;
4305 r = memcmp(p1, p2, len);
4306 if (r != 0)
4307 return INT2FIX(r < 0 ? -1 : 1);
4308 if (l1 != l2)
4309 return INT2FIX(l1 < l2 ? -1 : 1);
4310 }
4311 p1 += l1;
4312 p2 += l2;
4313 }
4314 }
4315 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4316 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4317 return INT2FIX(-1);
4318}
4319
4320/*
4321 * call-seq:
4322 * casecmp?(other_string) -> true, false, or nil
4323 *
4324 * Returns +true+ if +self+ and +other_string+ are equal after
4325 * Unicode case folding, otherwise +false+:
4326 *
4327 * 'foo'.casecmp?('foo') # => true
4328 * 'foo'.casecmp?('food') # => false
4329 * 'food'.casecmp?('foo') # => false
4330 * 'FOO'.casecmp?('foo') # => true
4331 * 'foo'.casecmp?('FOO') # => true
4332 *
4333 * Returns +nil+ if the two values are incomparable:
4334 *
4335 * 'foo'.casecmp?(1) # => nil
4336 *
4337 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4338 *
4339 * Related: String#casecmp.
4340 *
4341 */
4342
4343static VALUE
4344rb_str_casecmp_p(VALUE str1, VALUE str2)
4345{
4346 VALUE s = rb_check_string_type(str2);
4347 if (NIL_P(s)) {
4348 return Qnil;
4349 }
4350 return str_casecmp_p(str1, s);
4351}
4352
4353static VALUE
4354str_casecmp_p(VALUE str1, VALUE str2)
4355{
4356 rb_encoding *enc;
4357 VALUE folded_str1, folded_str2;
4358 VALUE fold_opt = sym_fold;
4359
4360 enc = rb_enc_compatible(str1, str2);
4361 if (!enc) {
4362 return Qnil;
4363 }
4364
4365 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4366 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4367
4368 return rb_str_eql(folded_str1, folded_str2);
4369}
4370
4371static long
4372strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4373 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4374{
4375 const char *search_start = str_ptr;
4376 long pos, search_len = str_len - offset;
4377
4378 for (;;) {
4379 const char *t;
4380 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4381 if (pos < 0) return pos;
4382 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4383 if (t == search_start + pos) break;
4384 search_len -= t - search_start;
4385 if (search_len <= 0) return -1;
4386 offset += t - search_start;
4387 search_start = t;
4388 }
4389 return pos + offset;
4390}
4391
4392/* found index in byte */
4393#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4394#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4395
4396static long
4397rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4398{
4399 const char *str_ptr, *str_ptr_end, *sub_ptr;
4400 long str_len, sub_len;
4401 rb_encoding *enc;
4402
4403 enc = rb_enc_check(str, sub);
4404 if (is_broken_string(sub)) return -1;
4405
4406 str_ptr = RSTRING_PTR(str);
4407 str_ptr_end = RSTRING_END(str);
4408 str_len = RSTRING_LEN(str);
4409 sub_ptr = RSTRING_PTR(sub);
4410 sub_len = RSTRING_LEN(sub);
4411
4412 if (str_len < sub_len) return -1;
4413
4414 if (offset != 0) {
4415 long str_len_char, sub_len_char;
4416 int single_byte = single_byte_optimizable(str);
4417 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4418 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4419 if (offset < 0) {
4420 offset += str_len_char;
4421 if (offset < 0) return -1;
4422 }
4423 if (str_len_char - offset < sub_len_char) return -1;
4424 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4425 str_ptr += offset;
4426 }
4427 if (sub_len == 0) return offset;
4428
4429 /* need proceed one character at a time */
4430 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4431}
4432
4433
4434/*
4435 * call-seq:
4436 * index(substring, offset = 0) -> integer or nil
4437 * index(regexp, offset = 0) -> integer or nil
4438 *
4439 * :include: doc/string/index.rdoc
4440 *
4441 */
4442
4443static VALUE
4444rb_str_index_m(int argc, VALUE *argv, VALUE str)
4445{
4446 VALUE sub;
4447 VALUE initpos;
4448 rb_encoding *enc = STR_ENC_GET(str);
4449 long pos;
4450
4451 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4452 long slen = str_strlen(str, enc); /* str's enc */
4453 pos = NUM2LONG(initpos);
4454 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4455 if (RB_TYPE_P(sub, T_REGEXP)) {
4457 }
4458 return Qnil;
4459 }
4460 }
4461 else {
4462 pos = 0;
4463 }
4464
4465 if (RB_TYPE_P(sub, T_REGEXP)) {
4466 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4467 enc, single_byte_optimizable(str));
4468
4469 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4470 VALUE match = rb_backref_get();
4471 struct re_registers *regs = RMATCH_REGS(match);
4472 pos = rb_str_sublen(str, BEG(0));
4473 return LONG2NUM(pos);
4474 }
4475 }
4476 else {
4477 StringValue(sub);
4478 pos = rb_str_index(str, sub, pos);
4479 if (pos >= 0) {
4480 pos = rb_str_sublen(str, pos);
4481 return LONG2NUM(pos);
4482 }
4483 }
4484 return Qnil;
4485}
4486
4487/* Ensure that the given pos is a valid character boundary.
4488 * Note that in this function, "character" means a code point
4489 * (Unicode scalar value), not a grapheme cluster.
4490 */
4491static void
4492str_ensure_byte_pos(VALUE str, long pos)
4493{
4494 if (!single_byte_optimizable(str)) {
4495 const char *s = RSTRING_PTR(str);
4496 const char *e = RSTRING_END(str);
4497 const char *p = s + pos;
4498 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4499 rb_raise(rb_eIndexError,
4500 "offset %ld does not land on character boundary", pos);
4501 }
4502 }
4503}
4504
4505/*
4506 * call-seq:
4507 * byteindex(substring, offset = 0) -> integer or nil
4508 * byteindex(regexp, offset = 0) -> integer or nil
4509 *
4510 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4511 * or +nil+ if none found:
4512 *
4513 * 'foo'.byteindex('f') # => 0
4514 * 'foo'.byteindex('o') # => 1
4515 * 'foo'.byteindex('oo') # => 1
4516 * 'foo'.byteindex('ooo') # => nil
4517 *
4518 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4519 * or +nil+ if none found:
4520 *
4521 * 'foo'.byteindex(/f/) # => 0
4522 * 'foo'.byteindex(/o/) # => 1
4523 * 'foo'.byteindex(/oo/) # => 1
4524 * 'foo'.byteindex(/ooo/) # => nil
4525 *
4526 * Integer argument +offset+, if given, specifies the byte-based position in the
4527 * string to begin the search:
4528 *
4529 * 'foo'.byteindex('o', 1) # => 1
4530 * 'foo'.byteindex('o', 2) # => 2
4531 * 'foo'.byteindex('o', 3) # => nil
4532 *
4533 * If +offset+ is negative, counts backward from the end of +self+:
4534 *
4535 * 'foo'.byteindex('o', -1) # => 2
4536 * 'foo'.byteindex('o', -2) # => 1
4537 * 'foo'.byteindex('o', -3) # => 1
4538 * 'foo'.byteindex('o', -4) # => nil
4539 *
4540 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4541 * raised.
4542 *
4543 * Related: String#index, String#byterindex.
4544 */
4545
4546static VALUE
4547rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4548{
4549 VALUE sub;
4550 VALUE initpos;
4551 long pos;
4552
4553 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4554 long slen = RSTRING_LEN(str);
4555 pos = NUM2LONG(initpos);
4556 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4557 if (RB_TYPE_P(sub, T_REGEXP)) {
4559 }
4560 return Qnil;
4561 }
4562 }
4563 else {
4564 pos = 0;
4565 }
4566
4567 str_ensure_byte_pos(str, pos);
4568
4569 if (RB_TYPE_P(sub, T_REGEXP)) {
4570 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4571 VALUE match = rb_backref_get();
4572 struct re_registers *regs = RMATCH_REGS(match);
4573 pos = BEG(0);
4574 return LONG2NUM(pos);
4575 }
4576 }
4577 else {
4578 StringValue(sub);
4579 pos = rb_str_byteindex(str, sub, pos);
4580 if (pos >= 0) return LONG2NUM(pos);
4581 }
4582 return Qnil;
4583}
4584
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms that do not provide memrchr(3).
 *
 * Scans the +search_len+ bytes starting at +search_str+ from the end
 * towards the beginning and returns a pointer to the last byte equal to
 * +chr+, or NULL when there is none.  As with the libc function, +chr+ is
 * converted to unsigned char before it is compared.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4597
4598static long
4599str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4600{
4601 char *hit, *adjusted;
4602 int c;
4603 long slen, searchlen;
4604 char *sbeg, *e, *t;
4605
4606 sbeg = RSTRING_PTR(str);
4607 slen = RSTRING_LEN(sub);
4608 if (slen == 0) return s - sbeg;
4609 e = RSTRING_END(str);
4610 t = RSTRING_PTR(sub);
4611 c = *t & 0xff;
4612 searchlen = s - sbeg + 1;
4613
4614 if (memcmp(s, t, slen) == 0) {
4615 return s - sbeg;
4616 }
4617
4618 do {
4619 hit = memrchr(sbeg, c, searchlen);
4620 if (!hit) break;
4621 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4622 if (hit != adjusted) {
4623 searchlen = adjusted - sbeg;
4624 continue;
4625 }
4626 if (memcmp(hit, t, slen) == 0)
4627 return hit - sbeg;
4628 searchlen = adjusted - sbeg;
4629 } while (searchlen > 0);
4630
4631 return -1;
4632}
4633
4634/* found index in byte */
4635static long
4636rb_str_rindex(VALUE str, VALUE sub, long pos)
4637{
4638 long len, slen;
4639 char *sbeg, *s;
4640 rb_encoding *enc;
4641 int singlebyte;
4642
4643 enc = rb_enc_check(str, sub);
4644 if (is_broken_string(sub)) return -1;
4645 singlebyte = single_byte_optimizable(str);
4646 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4647 slen = str_strlen(sub, enc); /* rb_enc_check */
4648
4649 /* substring longer than string */
4650 if (len < slen) return -1;
4651 if (len - pos < slen) pos = len - slen;
4652 if (len == 0) return pos;
4653
4654 sbeg = RSTRING_PTR(str);
4655
4656 if (pos == 0) {
4657 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4658 return 0;
4659 else
4660 return -1;
4661 }
4662
4663 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4664 return str_rindex(str, sub, s, enc);
4665}
4666
4667/*
4668 * call-seq:
4669 * rindex(substring, offset = self.length) -> integer or nil
4670 * rindex(regexp, offset = self.length) -> integer or nil
4671 *
4672 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4673 * or +nil+ if none found:
4674 *
4675 * 'foo'.rindex('f') # => 0
4676 * 'foo'.rindex('o') # => 2
4677 * 'foo'.rindex('oo') # => 1
4678 * 'foo'.rindex('ooo') # => nil
4679 *
4680 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4681 * or +nil+ if none found:
4682 *
4683 * 'foo'.rindex(/f/) # => 0
4684 * 'foo'.rindex(/o/) # => 2
4685 * 'foo'.rindex(/oo/) # => 1
4686 * 'foo'.rindex(/ooo/) # => nil
4687 *
4688 * The _last_ match means starting at the possible last position, not
4689 * the last of longest matches.
4690 *
4691 * 'foo'.rindex(/o+/) # => 2
4692 * $~ #=> #<MatchData "o">
4693 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind.
4696 *
4697 * 'foo'.rindex(/(?<!o)o+/) # => 1
4698 * $~ #=> #<MatchData "oo">
4699 *
 * Or use String#index with a negative lookahead.
4701 *
4702 * 'foo'.index(/o+(?!.*o)/) # => 1
4703 * $~ #=> #<MatchData "oo">
4704 *
4705 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4706 * string to _end_ the search:
4707 *
4708 * 'foo'.rindex('o', 0) # => nil
4709 * 'foo'.rindex('o', 1) # => 1
4710 * 'foo'.rindex('o', 2) # => 2
4711 * 'foo'.rindex('o', 3) # => 2
4712 *
4713 * If +offset+ is a negative Integer, the maximum starting position in the
4714 * string to _end_ the search is the sum of the string's length and +offset+:
4715 *
4716 * 'foo'.rindex('o', -1) # => 2
4717 * 'foo'.rindex('o', -2) # => 1
4718 * 'foo'.rindex('o', -3) # => nil
4719 * 'foo'.rindex('o', -4) # => nil
4720 *
4721 * Related: String#index.
4722 */
4723
4724static VALUE
4725rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4726{
4727 VALUE sub;
4728 VALUE initpos;
4729 rb_encoding *enc = STR_ENC_GET(str);
4730 long pos, len = str_strlen(str, enc); /* str's enc */
4731
4732 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4733 pos = NUM2LONG(initpos);
4734 if (pos < 0 && (pos += len) < 0) {
4735 if (RB_TYPE_P(sub, T_REGEXP)) {
4737 }
4738 return Qnil;
4739 }
4740 if (pos > len) pos = len;
4741 }
4742 else {
4743 pos = len;
4744 }
4745
4746 if (RB_TYPE_P(sub, T_REGEXP)) {
4747 /* enc = rb_enc_check(str, sub); */
4748 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4749 enc, single_byte_optimizable(str));
4750
4751 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4752 VALUE match = rb_backref_get();
4753 struct re_registers *regs = RMATCH_REGS(match);
4754 pos = rb_str_sublen(str, BEG(0));
4755 return LONG2NUM(pos);
4756 }
4757 }
4758 else {
4759 StringValue(sub);
4760 pos = rb_str_rindex(str, sub, pos);
4761 if (pos >= 0) {
4762 pos = rb_str_sublen(str, pos);
4763 return LONG2NUM(pos);
4764 }
4765 }
4766 return Qnil;
4767}
4768
4769static long
4770rb_str_byterindex(VALUE str, VALUE sub, long pos)
4771{
4772 long len, slen;
4773 char *sbeg, *s;
4774 rb_encoding *enc;
4775
4776 enc = rb_enc_check(str, sub);
4777 if (is_broken_string(sub)) return -1;
4778 len = RSTRING_LEN(str);
4779 slen = RSTRING_LEN(sub);
4780
4781 /* substring longer than string */
4782 if (len < slen) return -1;
4783 if (len - pos < slen) pos = len - slen;
4784 if (len == 0) return pos;
4785
4786 sbeg = RSTRING_PTR(str);
4787
4788 if (pos == 0) {
4789 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4790 return 0;
4791 else
4792 return -1;
4793 }
4794
4795 s = sbeg + pos;
4796 return str_rindex(str, sub, s, enc);
4797}
4798
4799
4800/*
4801 * call-seq:
4802 * byterindex(substring, offset = self.bytesize) -> integer or nil
4803 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4804 *
4805 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4806 * or +nil+ if none found:
4807 *
4808 * 'foo'.byterindex('f') # => 0
4809 * 'foo'.byterindex('o') # => 2
4810 * 'foo'.byterindex('oo') # => 1
4811 * 'foo'.byterindex('ooo') # => nil
4812 *
4813 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4814 * or +nil+ if none found:
4815 *
4816 * 'foo'.byterindex(/f/) # => 0
4817 * 'foo'.byterindex(/o/) # => 2
4818 * 'foo'.byterindex(/oo/) # => 1
4819 * 'foo'.byterindex(/ooo/) # => nil
4820 *
4821 * The _last_ match means starting at the possible last position, not
4822 * the last of longest matches.
4823 *
4824 * 'foo'.byterindex(/o+/) # => 2
4825 * $~ #=> #<MatchData "o">
4826 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind.
4829 *
4830 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4831 * $~ #=> #<MatchData "oo">
4832 *
 * Or use String#byteindex with a negative lookahead.
4834 *
4835 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4836 * $~ #=> #<MatchData "oo">
4837 *
4838 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4839 * string to _end_ the search:
4840 *
4841 * 'foo'.byterindex('o', 0) # => nil
4842 * 'foo'.byterindex('o', 1) # => 1
4843 * 'foo'.byterindex('o', 2) # => 2
4844 * 'foo'.byterindex('o', 3) # => 2
4845 *
4846 * If +offset+ is a negative Integer, the maximum starting position in the
4847 * string to _end_ the search is the sum of the string's length and +offset+:
4848 *
4849 * 'foo'.byterindex('o', -1) # => 2
4850 * 'foo'.byterindex('o', -2) # => 1
4851 * 'foo'.byterindex('o', -3) # => nil
4852 * 'foo'.byterindex('o', -4) # => nil
4853 *
4854 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4855 * raised.
4856 *
4857 * Related: String#byteindex.
4858 */
4859
4860static VALUE
4861rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4862{
4863 VALUE sub;
4864 VALUE initpos;
4865 long pos, len = RSTRING_LEN(str);
4866
4867 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4868 pos = NUM2LONG(initpos);
4869 if (pos < 0 && (pos += len) < 0) {
4870 if (RB_TYPE_P(sub, T_REGEXP)) {
4872 }
4873 return Qnil;
4874 }
4875 if (pos > len) pos = len;
4876 }
4877 else {
4878 pos = len;
4879 }
4880
4881 str_ensure_byte_pos(str, pos);
4882
4883 if (RB_TYPE_P(sub, T_REGEXP)) {
4884 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4885 VALUE match = rb_backref_get();
4886 struct re_registers *regs = RMATCH_REGS(match);
4887 pos = BEG(0);
4888 return LONG2NUM(pos);
4889 }
4890 }
4891 else {
4892 StringValue(sub);
4893 pos = rb_str_byterindex(str, sub, pos);
4894 if (pos >= 0) return LONG2NUM(pos);
4895 }
4896 return Qnil;
4897}
4898
4899/*
4900 * call-seq:
4901 * string =~ regexp -> integer or nil
4902 * string =~ object -> integer or nil
4903 *
4904 * Returns the Integer index of the first substring that matches
4905 * the given +regexp+, or +nil+ if no match found:
4906 *
4907 * 'foo' =~ /f/ # => 0
4908 * 'foo' =~ /o/ # => 1
4909 * 'foo' =~ /x/ # => nil
4910 *
4911 * Note: also updates Regexp@Global+Variables.
4912 *
4913 * If the given +object+ is not a Regexp, returns the value
4914 * returned by <tt>object =~ self</tt>.
4915 *
4916 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4917 * (see Regexp#=~):
4918 *
4919 * number= nil
4920 * "no. 9" =~ /(?<number>\d+)/
4921 * number # => nil (not assigned)
4922 * /(?<number>\d+)/ =~ "no. 9"
4923 * number #=> "9"
4924 *
4925 */
4926
4927static VALUE
4928rb_str_match(VALUE x, VALUE y)
4929{
4930 switch (OBJ_BUILTIN_TYPE(y)) {
4931 case T_STRING:
4932 rb_raise(rb_eTypeError, "type mismatch: String given");
4933
4934 case T_REGEXP:
4935 return rb_reg_match(y, x);
4936
4937 default:
4938 return rb_funcall(y, idEqTilde, 1, x);
4939 }
4940}
4941
4942
4943static VALUE get_pat(VALUE);
4944
4945
4946/*
4947 * call-seq:
4948 * match(pattern, offset = 0) -> matchdata or nil
4949 * match(pattern, offset = 0) {|matchdata| ... } -> object
4950 *
4951 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4952 *
4953 * Note: also updates Regexp@Global+Variables.
4954 *
4955 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4956 * regexp = Regexp.new(pattern)
4957 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4958 * (see Regexp#match):
4959 * matchdata = regexp.match(self)
4960 *
4961 * With no block given, returns the computed +matchdata+:
4962 *
4963 * 'foo'.match('f') # => #<MatchData "f">
4964 * 'foo'.match('o') # => #<MatchData "o">
4965 * 'foo'.match('x') # => nil
4966 *
4967 * If Integer argument +offset+ is given, the search begins at index +offset+:
4968 *
4969 * 'foo'.match('f', 1) # => nil
4970 * 'foo'.match('o', 1) # => #<MatchData "o">
4971 *
4972 * With a block given, calls the block with the computed +matchdata+
4973 * and returns the block's return value:
4974 *
4975 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4976 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4977 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4978 *
4979 */
4980
4981static VALUE
4982rb_str_match_m(int argc, VALUE *argv, VALUE str)
4983{
4984 VALUE re, result;
4985 if (argc < 1)
4986 rb_check_arity(argc, 1, 2);
4987 re = argv[0];
4988 argv[0] = str;
4989 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4990 if (!NIL_P(result) && rb_block_given_p()) {
4991 return rb_yield(result);
4992 }
4993 return result;
4994}
4995
4996/*
4997 * call-seq:
4998 * match?(pattern, offset = 0) -> true or false
4999 *
5000 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5001 *
5002 * Note: does not update Regexp@Global+Variables.
5003 *
5004 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5005 * regexp = Regexp.new(pattern)
5006 *
5007 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5008 * +false+ otherwise:
5009 *
5010 * 'foo'.match?(/o/) # => true
5011 * 'foo'.match?('o') # => true
5012 * 'foo'.match?(/x/) # => false
5013 *
5014 * If Integer argument +offset+ is given, the search begins at index +offset+:
5015 * 'foo'.match?('f', 1) # => false
5016 * 'foo'.match?('o', 1) # => true
5017 *
5018 */
5019
5020static VALUE
5021rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5022{
5023 VALUE re;
5024 rb_check_arity(argc, 1, 2);
5025 re = get_pat(argv[0]);
5026 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5027}
5028
/* Result of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* input (or result) is not a valid character */
    NEIGHBOR_FOUND    = 1,  /* the neighboring character was written in place */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
5034
5035static enum neighbor_char
5036enc_succ_char(char *p, long len, rb_encoding *enc)
5037{
5038 long i;
5039 int l;
5040
5041 if (rb_enc_mbminlen(enc) > 1) {
5042 /* wchar, trivial case */
5043 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5044 if (!MBCLEN_CHARFOUND_P(r)) {
5045 return NEIGHBOR_NOT_CHAR;
5046 }
5047 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5048 l = rb_enc_code_to_mbclen(c, enc);
5049 if (!l) return NEIGHBOR_NOT_CHAR;
5050 if (l != len) return NEIGHBOR_WRAPPED;
5051 rb_enc_mbcput(c, p, enc);
5052 r = rb_enc_precise_mbclen(p, p + len, enc);
5053 if (!MBCLEN_CHARFOUND_P(r)) {
5054 return NEIGHBOR_NOT_CHAR;
5055 }
5056 return NEIGHBOR_FOUND;
5057 }
5058 while (1) {
5059 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5060 p[i] = '\0';
5061 if (i < 0)
5062 return NEIGHBOR_WRAPPED;
5063 ++((unsigned char*)p)[i];
5064 l = rb_enc_precise_mbclen(p, p+len, enc);
5065 if (MBCLEN_CHARFOUND_P(l)) {
5066 l = MBCLEN_CHARFOUND_LEN(l);
5067 if (l == len) {
5068 return NEIGHBOR_FOUND;
5069 }
5070 else {
5071 memset(p+l, 0xff, len-l);
5072 }
5073 }
5074 if (MBCLEN_INVALID_P(l) && i < len-1) {
5075 long len2;
5076 int l2;
5077 for (len2 = len-1; 0 < len2; len2--) {
5078 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5079 if (!MBCLEN_INVALID_P(l2))
5080 break;
5081 }
5082 memset(p+len2+1, 0xff, len-(len2+1));
5083 }
5084 }
5085}
5086
5087static enum neighbor_char
5088enc_pred_char(char *p, long len, rb_encoding *enc)
5089{
5090 long i;
5091 int l;
5092 if (rb_enc_mbminlen(enc) > 1) {
5093 /* wchar, trivial case */
5094 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5095 if (!MBCLEN_CHARFOUND_P(r)) {
5096 return NEIGHBOR_NOT_CHAR;
5097 }
5098 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5099 if (!c) return NEIGHBOR_NOT_CHAR;
5100 --c;
5101 l = rb_enc_code_to_mbclen(c, enc);
5102 if (!l) return NEIGHBOR_NOT_CHAR;
5103 if (l != len) return NEIGHBOR_WRAPPED;
5104 rb_enc_mbcput(c, p, enc);
5105 r = rb_enc_precise_mbclen(p, p + len, enc);
5106 if (!MBCLEN_CHARFOUND_P(r)) {
5107 return NEIGHBOR_NOT_CHAR;
5108 }
5109 return NEIGHBOR_FOUND;
5110 }
5111 while (1) {
5112 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5113 p[i] = '\xff';
5114 if (i < 0)
5115 return NEIGHBOR_WRAPPED;
5116 --((unsigned char*)p)[i];
5117 l = rb_enc_precise_mbclen(p, p+len, enc);
5118 if (MBCLEN_CHARFOUND_P(l)) {
5119 l = MBCLEN_CHARFOUND_LEN(l);
5120 if (l == len) {
5121 return NEIGHBOR_FOUND;
5122 }
5123 else {
5124 memset(p+l, 0, len-l);
5125 }
5126 }
5127 if (MBCLEN_INVALID_P(l) && i < len-1) {
5128 long len2;
5129 int l2;
5130 for (len2 = len-1; 0 < len2; len2--) {
5131 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5132 if (!MBCLEN_INVALID_P(l2))
5133 break;
5134 }
5135 memset(p+len2+1, 0, len-(len2+1));
5136 }
5137 }
5138}
5139
5140/*
5141 overwrite +p+ by succeeding letter in +enc+ and returns
5142 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5143 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5144 assuming each ranges are successive, and mbclen
5145 never change in each ranges.
5146 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5147 character.
5148 */
5149static enum neighbor_char
5150enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5151{
5152 enum neighbor_char ret;
5153 unsigned int c;
5154 int ctype;
5155 int range;
5156 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5157
5158 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5159 int try;
5160 const int max_gaps = 1;
5161
5162 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5163 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5164 ctype = ONIGENC_CTYPE_DIGIT;
5165 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5166 ctype = ONIGENC_CTYPE_ALPHA;
5167 else
5168 return NEIGHBOR_NOT_CHAR;
5169
5170 MEMCPY(save, p, char, len);
5171 for (try = 0; try <= max_gaps; ++try) {
5172 ret = enc_succ_char(p, len, enc);
5173 if (ret == NEIGHBOR_FOUND) {
5174 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5175 if (rb_enc_isctype(c, ctype, enc))
5176 return NEIGHBOR_FOUND;
5177 }
5178 }
5179 MEMCPY(p, save, char, len);
5180 range = 1;
5181 while (1) {
5182 MEMCPY(save, p, char, len);
5183 ret = enc_pred_char(p, len, enc);
5184 if (ret == NEIGHBOR_FOUND) {
5185 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5186 if (!rb_enc_isctype(c, ctype, enc)) {
5187 MEMCPY(p, save, char, len);
5188 break;
5189 }
5190 }
5191 else {
5192 MEMCPY(p, save, char, len);
5193 break;
5194 }
5195 range++;
5196 }
5197 if (range == 1) {
5198 return NEIGHBOR_NOT_CHAR;
5199 }
5200
5201 if (ctype != ONIGENC_CTYPE_DIGIT) {
5202 MEMCPY(carry, p, char, len);
5203 return NEIGHBOR_WRAPPED;
5204 }
5205
5206 MEMCPY(carry, p, char, len);
5207 enc_succ_char(carry, len, enc);
5208 return NEIGHBOR_WRAPPED;
5209}
5210
5211
/* Forward declaration; str_succ() is defined below, after rb_str_succ(). */
static VALUE str_succ(VALUE str);
5213
5214/*
5215 * call-seq:
5216 * succ -> new_str
5217 *
5218 * Returns the successor to +self+. The successor is calculated by
5219 * incrementing characters.
5220 *
5221 * The first character to be incremented is the rightmost alphanumeric,
5222 * or, if there are no alphanumerics, the rightmost character:
5223 *
5224 * 'THX1138'.succ # => "THX1139"
5225 * '<<koala>>'.succ # => "<<koalb>>"
5226 * '***'.succ # => '**+'
5227 *
5228 * The successor to a digit is another digit, "carrying" to the next-left
5229 * character for a "rollover" from 9 to 0, and prepending another digit
5230 * if necessary:
5231 *
5232 * '00'.succ # => "01"
5233 * '09'.succ # => "10"
5234 * '99'.succ # => "100"
5235 *
5236 * The successor to a letter is another letter of the same case,
5237 * carrying to the next-left character for a rollover,
5238 * and prepending another same-case letter if necessary:
5239 *
5240 * 'aa'.succ # => "ab"
5241 * 'az'.succ # => "ba"
5242 * 'zz'.succ # => "aaa"
5243 * 'AA'.succ # => "AB"
5244 * 'AZ'.succ # => "BA"
5245 * 'ZZ'.succ # => "AAA"
5246 *
5247 * The successor to a non-alphanumeric character is the next character
5248 * in the underlying character set's collating sequence,
5249 * carrying to the next-left character for a rollover,
5250 * and prepending another character if necessary:
5251 *
5252 * s = 0.chr * 3
5253 * s # => "\x00\x00\x00"
5254 * s.succ # => "\x00\x00\x01"
5255 * s = 255.chr * 3
5256 * s # => "\xFF\xFF\xFF"
5257 * s.succ # => "\x01\x00\x00\x00"
5258 *
5259 * Carrying can occur between and among mixtures of alphanumeric characters:
5260 *
5261 * s = 'zz99zz99'
5262 * s.succ # => "aaa00aa00"
5263 * s = '99zz99zz'
5264 * s.succ # => "100aa00aa"
5265 *
5266 * The successor to an empty +String+ is a new empty +String+:
5267 *
5268 * ''.succ # => ""
5269 *
5270 */
5271
5272VALUE
5274{
5275 VALUE str;
5276 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5277 rb_enc_cr_str_copy_for_substr(str, orig);
5278 return str_succ(str);
5279}
5280
5281static VALUE
5282str_succ(VALUE str)
5283{
5284 rb_encoding *enc;
5285 char *sbeg, *s, *e, *last_alnum = 0;
5286 int found_alnum = 0;
5287 long l, slen;
5288 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5289 long carry_pos = 0, carry_len = 1;
5290 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5291
5292 slen = RSTRING_LEN(str);
5293 if (slen == 0) return str;
5294
5295 enc = STR_ENC_GET(str);
5296 sbeg = RSTRING_PTR(str);
5297 s = e = sbeg + slen;
5298
5299 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5300 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5301 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5302 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5303 break;
5304 }
5305 }
5306 l = rb_enc_precise_mbclen(s, e, enc);
5307 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5308 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5309 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5310 switch (neighbor) {
5311 case NEIGHBOR_NOT_CHAR:
5312 continue;
5313 case NEIGHBOR_FOUND:
5314 return str;
5315 case NEIGHBOR_WRAPPED:
5316 last_alnum = s;
5317 break;
5318 }
5319 found_alnum = 1;
5320 carry_pos = s - sbeg;
5321 carry_len = l;
5322 }
5323 if (!found_alnum) { /* str contains no alnum */
5324 s = e;
5325 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5326 enum neighbor_char neighbor;
5327 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5328 l = rb_enc_precise_mbclen(s, e, enc);
5329 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5330 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5331 MEMCPY(tmp, s, char, l);
5332 neighbor = enc_succ_char(tmp, l, enc);
5333 switch (neighbor) {
5334 case NEIGHBOR_FOUND:
5335 MEMCPY(s, tmp, char, l);
5336 return str;
5337 break;
5338 case NEIGHBOR_WRAPPED:
5339 MEMCPY(s, tmp, char, l);
5340 break;
5341 case NEIGHBOR_NOT_CHAR:
5342 break;
5343 }
5344 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5345 /* wrapped to \0...\0. search next valid char. */
5346 enc_succ_char(s, l, enc);
5347 }
5348 if (!rb_enc_asciicompat(enc)) {
5349 MEMCPY(carry, s, char, l);
5350 carry_len = l;
5351 }
5352 carry_pos = s - sbeg;
5353 }
5355 }
5356 RESIZE_CAPA(str, slen + carry_len);
5357 sbeg = RSTRING_PTR(str);
5358 s = sbeg + carry_pos;
5359 memmove(s + carry_len, s, slen - carry_pos);
5360 memmove(s, carry, carry_len);
5361 slen += carry_len;
5362 STR_SET_LEN(str, slen);
5363 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5365 return str;
5366}
5367
5368
5369/*
5370 * call-seq:
5371 * succ! -> self
5372 *
5373 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5374 */
5375
5376static VALUE
5377rb_str_succ_bang(VALUE str)
5378{
5379 rb_str_modify(str);
5380 str_succ(str);
5381 return str;
5382}
5383
static int
all_digits_p(const char *s, long len)
{
    /* Returns 1 when each of the first len bytes of s satisfies ISDIGIT
     * (trivially 1 for len <= 0), otherwise 0. */
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5393
5394static int
5395str_upto_i(VALUE str, VALUE arg)
5396{
5397 rb_yield(str);
5398 return 0;
5399}
5400
5401/*
5402 * call-seq:
5403 * upto(other_string, exclusive = false) {|string| ... } -> self
5404 * upto(other_string, exclusive = false) -> new_enumerator
5405 *
5406 * With a block given, calls the block with each +String+ value
5407 * returned by successive calls to String#succ;
5408 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5409 * the sequence terminates when value +other_string+ is reached;
5410 * returns +self+:
5411 *
5412 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5413 * Output:
5414 *
5415 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5416 *
5417 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5418 *
5419 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5420 *
5421 * Output:
5422 *
5423 * a8 a9 b0 b1 b2 b3 b4 b5
5424 *
5425 * If +other_string+ would not be reached, does not call the block:
5426 *
5427 * '25'.upto('5') {|s| fail s }
5428 * 'aa'.upto('a') {|s| fail s }
5429 *
5430 * With no block given, returns a new Enumerator:
5431 *
5432 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5433 *
5434 */
5435
5436static VALUE
5437rb_str_upto(int argc, VALUE *argv, VALUE beg)
5438{
5439 VALUE end, exclusive;
5440
5441 rb_scan_args(argc, argv, "11", &end, &exclusive);
5442 RETURN_ENUMERATOR(beg, argc, argv);
5443 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5444}
5445
5446VALUE
5447rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5448{
5449 VALUE current, after_end;
5450 ID succ;
5451 int n, ascii;
5452 rb_encoding *enc;
5453
5454 CONST_ID(succ, "succ");
5455 StringValue(end);
5456 enc = rb_enc_check(beg, end);
5457 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5458 /* single character */
5459 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5460 char c = RSTRING_PTR(beg)[0];
5461 char e = RSTRING_PTR(end)[0];
5462
5463 if (c > e || (excl && c == e)) return beg;
5464 for (;;) {
5465 VALUE str = rb_enc_str_new(&c, 1, enc);
5467 if ((*each)(str, arg)) break;
5468 if (!excl && c == e) break;
5469 c++;
5470 if (excl && c == e) break;
5471 }
5472 return beg;
5473 }
5474 /* both edges are all digits */
5475 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5476 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5477 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5478 VALUE b, e;
5479 int width;
5480
5481 width = RSTRING_LENINT(beg);
5482 b = rb_str_to_inum(beg, 10, FALSE);
5483 e = rb_str_to_inum(end, 10, FALSE);
5484 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5485 long bi = FIX2LONG(b);
5486 long ei = FIX2LONG(e);
5487 rb_encoding *usascii = rb_usascii_encoding();
5488
5489 while (bi <= ei) {
5490 if (excl && bi == ei) break;
5491 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5492 bi++;
5493 }
5494 }
5495 else {
5496 ID op = excl ? '<' : idLE;
5497 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5498
5499 args[0] = INT2FIX(width);
5500 while (rb_funcall(b, op, 1, e)) {
5501 args[1] = b;
5502 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5503 b = rb_funcallv(b, succ, 0, 0);
5504 }
5505 }
5506 return beg;
5507 }
5508 /* normal case */
5509 n = rb_str_cmp(beg, end);
5510 if (n > 0 || (excl && n == 0)) return beg;
5511
5512 after_end = rb_funcallv(end, succ, 0, 0);
5513 current = str_duplicate(rb_cString, beg);
5514 while (!rb_str_equal(current, after_end)) {
5515 VALUE next = Qnil;
5516 if (excl || !rb_str_equal(current, end))
5517 next = rb_funcallv(current, succ, 0, 0);
5518 if ((*each)(current, arg)) break;
5519 if (NIL_P(next)) break;
5520 current = next;
5521 StringValue(current);
5522 if (excl && rb_str_equal(current, end)) break;
5523 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5524 break;
5525 }
5526
5527 return beg;
5528}
5529
5530VALUE
5531rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5532{
5533 VALUE current;
5534 ID succ;
5535
5536 CONST_ID(succ, "succ");
5537 /* both edges are all digits */
5538 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5539 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5540 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5541 int width = RSTRING_LENINT(beg);
5542 b = rb_str_to_inum(beg, 10, FALSE);
5543 if (FIXNUM_P(b)) {
5544 long bi = FIX2LONG(b);
5545 rb_encoding *usascii = rb_usascii_encoding();
5546
5547 while (FIXABLE(bi)) {
5548 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5549 bi++;
5550 }
5551 b = LONG2NUM(bi);
5552 }
5553 args[0] = INT2FIX(width);
5554 while (1) {
5555 args[1] = b;
5556 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5557 b = rb_funcallv(b, succ, 0, 0);
5558 }
5559 }
5560 /* normal case */
5561 current = str_duplicate(rb_cString, beg);
5562 while (1) {
5563 VALUE next = rb_funcallv(current, succ, 0, 0);
5564 if ((*each)(current, arg)) break;
5565 current = next;
5566 StringValue(current);
5567 if (RSTRING_LEN(current) == 0)
5568 break;
5569 }
5570
5571 return beg;
5572}
5573
5574static int
5575include_range_i(VALUE str, VALUE arg)
5576{
5577 VALUE *argp = (VALUE *)arg;
5578 if (!rb_equal(str, *argp)) return 0;
5579 *argp = Qnil;
5580 return 1;
5581}
5582
5583VALUE
5584rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5585{
5586 beg = rb_str_new_frozen(beg);
5587 StringValue(end);
5588 end = rb_str_new_frozen(end);
5589 if (NIL_P(val)) return Qfalse;
5590 val = rb_check_string_type(val);
5591 if (NIL_P(val)) return Qfalse;
5592 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5593 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5594 rb_enc_asciicompat(STR_ENC_GET(val))) {
5595 const char *bp = RSTRING_PTR(beg);
5596 const char *ep = RSTRING_PTR(end);
5597 const char *vp = RSTRING_PTR(val);
5598 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5599 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5600 return Qfalse;
5601 else {
5602 char b = *bp;
5603 char e = *ep;
5604 char v = *vp;
5605
5606 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5607 if (b <= v && v < e) return Qtrue;
5608 return RBOOL(!RTEST(exclusive) && v == e);
5609 }
5610 }
5611 }
5612#if 0
5613 /* both edges are all digits */
5614 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5615 all_digits_p(bp, RSTRING_LEN(beg)) &&
5616 all_digits_p(ep, RSTRING_LEN(end))) {
5617 /* TODO */
5618 }
5619#endif
5620 }
5621 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5622
5623 return RBOOL(NIL_P(val));
5624}
5625
5626static VALUE
5627rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5628{
5629 if (rb_reg_search(re, str, 0, 0) >= 0) {
5630 VALUE match = rb_backref_get();
5631 int nth = rb_reg_backref_number(match, backref);
5632 return rb_reg_nth_match(nth, match);
5633 }
5634 return Qnil;
5635}
5636
5637static VALUE
5638rb_str_aref(VALUE str, VALUE indx)
5639{
5640 long idx;
5641
5642 if (FIXNUM_P(indx)) {
5643 idx = FIX2LONG(indx);
5644 }
5645 else if (RB_TYPE_P(indx, T_REGEXP)) {
5646 return rb_str_subpat(str, indx, INT2FIX(0));
5647 }
5648 else if (RB_TYPE_P(indx, T_STRING)) {
5649 if (rb_str_index(str, indx, 0) != -1)
5650 return str_duplicate(rb_cString, indx);
5651 return Qnil;
5652 }
5653 else {
5654 /* check if indx is Range */
5655 long beg, len = str_strlen(str, NULL);
5656 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5657 case Qfalse:
5658 break;
5659 case Qnil:
5660 return Qnil;
5661 default:
5662 return rb_str_substr(str, beg, len);
5663 }
5664 idx = NUM2LONG(indx);
5665 }
5666
5667 return str_substr(str, idx, 1, FALSE);
5668}
5669
5670
5671/*
5672 * call-seq:
5673 * string[index] -> new_string or nil
5674 * string[start, length] -> new_string or nil
5675 * string[range] -> new_string or nil
5676 * string[regexp, capture = 0] -> new_string or nil
5677 * string[substring] -> new_string or nil
5678 *
5679 * Returns the substring of +self+ specified by the arguments.
5680 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5681 *
5682 *
5683 */
5684
5685static VALUE
5686rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5687{
5688 if (argc == 2) {
5689 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5690 return rb_str_subpat(str, argv[0], argv[1]);
5691 }
5692 else {
5693 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5694 }
5695 }
5696 rb_check_arity(argc, 1, 2);
5697 return rb_str_aref(str, argv[0]);
5698}
5699
5700VALUE
5702{
5703 char *ptr = RSTRING_PTR(str);
5704 long olen = RSTRING_LEN(str), nlen;
5705
5706 str_modifiable(str);
5707 if (len > olen) len = olen;
5708 nlen = olen - len;
5709 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5710 char *oldptr = ptr;
5711 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5712 STR_SET_EMBED(str);
5713 ptr = RSTRING(str)->as.embed.ary;
5714 memmove(ptr, oldptr + len, nlen);
5715 if (fl == STR_NOEMBED) xfree(oldptr);
5716 }
5717 else {
5718 if (!STR_SHARED_P(str)) {
5719 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5720 rb_enc_cr_str_exact_copy(shared, str);
5721 OBJ_FREEZE(shared);
5722 }
5723 ptr = RSTRING(str)->as.heap.ptr += len;
5724 }
5725 STR_SET_LEN(str, nlen);
5726
5727 if (!SHARABLE_MIDDLE_SUBSTRING) {
5728 TERM_FILL(ptr + nlen, TERM_LEN(str));
5729 }
5731 return str;
5732}
5733
5734static void
5735rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5736{
5737 char *sptr;
5738 long slen;
5739 int cr;
5740
5741 if (beg == 0 && vlen == 0) {
5742 rb_str_drop_bytes(str, len);
5743 return;
5744 }
5745
5746 str_modify_keep_cr(str);
5747 RSTRING_GETMEM(str, sptr, slen);
5748 if (len < vlen) {
5749 /* expand string */
5750 RESIZE_CAPA(str, slen + vlen - len);
5751 sptr = RSTRING_PTR(str);
5752 }
5753
5755 cr = rb_enc_str_coderange(val);
5756 else
5758
5759 if (vlen != len) {
5760 memmove(sptr + beg + vlen,
5761 sptr + beg + len,
5762 slen - (beg + len));
5763 }
5764 if (vlen < beg && len < 0) {
5765 MEMZERO(sptr + slen, char, -len);
5766 }
5767 if (vlen > 0) {
5768 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5769 }
5770 slen += vlen - len;
5771 STR_SET_LEN(str, slen);
5772 TERM_FILL(&sptr[slen], TERM_LEN(str));
5773 ENC_CODERANGE_SET(str, cr);
5774}
5775
5776static inline void
5777rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5778{
5779 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5780}
5781
5782void
5783rb_str_update(VALUE str, long beg, long len, VALUE val)
5784{
5785 long slen;
5786 char *p, *e;
5787 rb_encoding *enc;
5788 int singlebyte = single_byte_optimizable(str);
5789 int cr;
5790
5791 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5792
5793 StringValue(val);
5794 enc = rb_enc_check(str, val);
5795 slen = str_strlen(str, enc); /* rb_enc_check */
5796
5797 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5798 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5799 }
5800 if (beg < 0) {
5801 beg += slen;
5802 }
5803 RUBY_ASSERT(beg >= 0);
5804 RUBY_ASSERT(beg <= slen);
5805
5806 if (len > slen - beg) {
5807 len = slen - beg;
5808 }
5809 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5810 if (!p) p = RSTRING_END(str);
5811 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5812 if (!e) e = RSTRING_END(str);
5813 /* error check */
5814 beg = p - RSTRING_PTR(str); /* physical position */
5815 len = e - p; /* physical length */
5816 rb_str_update_0(str, beg, len, val);
5817 rb_enc_associate(str, enc);
5819 if (cr != ENC_CODERANGE_BROKEN)
5820 ENC_CODERANGE_SET(str, cr);
5821}
5822
5823static void
5824rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5825{
5826 int nth;
5827 VALUE match;
5828 long start, end, len;
5829 rb_encoding *enc;
5830 struct re_registers *regs;
5831
5832 if (rb_reg_search(re, str, 0, 0) < 0) {
5833 rb_raise(rb_eIndexError, "regexp not matched");
5834 }
5835 match = rb_backref_get();
5836 nth = rb_reg_backref_number(match, backref);
5837 regs = RMATCH_REGS(match);
5838 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5839 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5840 }
5841 if (nth < 0) {
5842 nth += regs->num_regs;
5843 }
5844
5845 start = BEG(nth);
5846 if (start == -1) {
5847 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5848 }
5849 end = END(nth);
5850 len = end - start;
5851 StringValue(val);
5852 enc = rb_enc_check_str(str, val);
5853 rb_str_update_0(str, start, len, val);
5854 rb_enc_associate(str, enc);
5855}
5856
5857static VALUE
5858rb_str_aset(VALUE str, VALUE indx, VALUE val)
5859{
5860 long idx, beg;
5861
5862 switch (TYPE(indx)) {
5863 case T_REGEXP:
5864 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5865 return val;
5866
5867 case T_STRING:
5868 beg = rb_str_index(str, indx, 0);
5869 if (beg < 0) {
5870 rb_raise(rb_eIndexError, "string not matched");
5871 }
5872 beg = rb_str_sublen(str, beg);
5873 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5874 return val;
5875
5876 default:
5877 /* check if indx is Range */
5878 {
5879 long beg, len;
5880 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5881 rb_str_update(str, beg, len, val);
5882 return val;
5883 }
5884 }
5885 /* FALLTHROUGH */
5886
5887 case T_FIXNUM:
5888 idx = NUM2LONG(indx);
5889 rb_str_update(str, idx, 1, val);
5890 return val;
5891 }
5892}
5893
5894/*
5895 * call-seq:
5896 * string[index] = new_string
5897 * string[start, length] = new_string
5898 * string[range] = new_string
5899 * string[regexp, capture = 0] = new_string
5900 * string[substring] = new_string
5901 *
5902 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5903 * See {String Slices}[rdoc-ref:String@String+Slices].
5904 *
5905 * A few examples:
5906 *
5907 * s = 'foo'
5908 * s[2] = 'rtune' # => "rtune"
5909 * s # => "fortune"
5910 * s[1, 5] = 'init' # => "init"
5911 * s # => "finite"
5912 * s[3..4] = 'al' # => "al"
5913 * s # => "finale"
5914 * s[/e$/] = 'ly' # => "ly"
5915 * s # => "finally"
5916 * s['lly'] = 'ncial' # => "ncial"
5917 * s # => "financial"
5918 *
5919 */
5920
5921static VALUE
5922rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5923{
5924 if (argc == 3) {
5925 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5926 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5927 }
5928 else {
5929 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5930 }
5931 return argv[2];
5932 }
5933 rb_check_arity(argc, 2, 3);
5934 return rb_str_aset(str, argv[0], argv[1]);
5935}
5936
5937/*
5938 * call-seq:
5939 * insert(index, other_string) -> self
5940 *
5941 * Inserts the given +other_string+ into +self+; returns +self+.
5942 *
5943 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5944 *
5945 * 'foo'.insert(1, 'bar') # => "fbaroo"
5946 *
5947 * If the Integer +index+ is negative, counts backward from the end of +self+
5948 * and inserts +other_string+ at offset <tt>index+1</tt>
5949 * (that is, _after_ <tt>self[index]</tt>):
5950 *
5951 * 'foo'.insert(-2, 'bar') # => "fobaro"
5952 *
5953 */
5954
5955static VALUE
5956rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5957{
5958 long pos = NUM2LONG(idx);
5959
5960 if (pos == -1) {
5961 return rb_str_append(str, str2);
5962 }
5963 else if (pos < 0) {
5964 pos++;
5965 }
5966 rb_str_update(str, pos, 0, str2);
5967 return str;
5968}
5969
5970
5971/*
5972 * call-seq:
5973 * slice!(index) -> new_string or nil
5974 * slice!(start, length) -> new_string or nil
5975 * slice!(range) -> new_string or nil
5976 * slice!(regexp, capture = 0) -> new_string or nil
5977 * slice!(substring) -> new_string or nil
5978 *
5979 * Removes and returns the substring of +self+ specified by the arguments.
5980 * See {String Slices}[rdoc-ref:String@String+Slices].
5981 *
5982 * A few examples:
5983 *
5984 * string = "This is a string"
5985 * string.slice!(2) #=> "i"
5986 * string.slice!(3..6) #=> " is "
5987 * string.slice!(/s.*t/) #=> "sa st"
5988 * string.slice!("r") #=> "r"
5989 * string #=> "Thing"
5990 *
5991 */
5992
5993static VALUE
5994rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5995{
5996 VALUE result = Qnil;
5997 VALUE indx;
5998 long beg, len = 1;
5999 char *p;
6000
6001 rb_check_arity(argc, 1, 2);
6002 str_modify_keep_cr(str);
6003 indx = argv[0];
6004 if (RB_TYPE_P(indx, T_REGEXP)) {
6005 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6006 VALUE match = rb_backref_get();
6007 struct re_registers *regs = RMATCH_REGS(match);
6008 int nth = 0;
6009 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6010 if ((nth += regs->num_regs) <= 0) return Qnil;
6011 }
6012 else if (nth >= regs->num_regs) return Qnil;
6013 beg = BEG(nth);
6014 len = END(nth) - beg;
6015 goto subseq;
6016 }
6017 else if (argc == 2) {
6018 beg = NUM2LONG(indx);
6019 len = NUM2LONG(argv[1]);
6020 goto num_index;
6021 }
6022 else if (FIXNUM_P(indx)) {
6023 beg = FIX2LONG(indx);
6024 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6025 if (!len) return Qnil;
6026 beg = p - RSTRING_PTR(str);
6027 goto subseq;
6028 }
6029 else if (RB_TYPE_P(indx, T_STRING)) {
6030 beg = rb_str_index(str, indx, 0);
6031 if (beg == -1) return Qnil;
6032 len = RSTRING_LEN(indx);
6033 result = str_duplicate(rb_cString, indx);
6034 goto squash;
6035 }
6036 else {
6037 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6038 case Qnil:
6039 return Qnil;
6040 case Qfalse:
6041 beg = NUM2LONG(indx);
6042 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6043 if (!len) return Qnil;
6044 beg = p - RSTRING_PTR(str);
6045 goto subseq;
6046 default:
6047 goto num_index;
6048 }
6049 }
6050
6051 num_index:
6052 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6053 beg = p - RSTRING_PTR(str);
6054
6055 subseq:
6056 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6057 rb_enc_cr_str_copy_for_substr(result, str);
6058
6059 squash:
6060 if (len > 0) {
6061 if (beg == 0) {
6062 rb_str_drop_bytes(str, len);
6063 }
6064 else {
6065 char *sptr = RSTRING_PTR(str);
6066 long slen = RSTRING_LEN(str);
6067 if (beg + len > slen) /* pathological check */
6068 len = slen - beg;
6069 memmove(sptr + beg,
6070 sptr + beg + len,
6071 slen - (beg + len));
6072 slen -= len;
6073 STR_SET_LEN(str, slen);
6074 TERM_FILL(&sptr[slen], TERM_LEN(str));
6075 }
6076 }
6077 return result;
6078}
6079
6080static VALUE
6081get_pat(VALUE pat)
6082{
6083 VALUE val;
6084
6085 switch (OBJ_BUILTIN_TYPE(pat)) {
6086 case T_REGEXP:
6087 return pat;
6088
6089 case T_STRING:
6090 break;
6091
6092 default:
6093 val = rb_check_string_type(pat);
6094 if (NIL_P(val)) {
6095 Check_Type(pat, T_REGEXP);
6096 }
6097 pat = val;
6098 }
6099
6100 return rb_reg_regcomp(pat);
6101}
6102
6103static VALUE
6104get_pat_quoted(VALUE pat, int check)
6105{
6106 VALUE val;
6107
6108 switch (OBJ_BUILTIN_TYPE(pat)) {
6109 case T_REGEXP:
6110 return pat;
6111
6112 case T_STRING:
6113 break;
6114
6115 default:
6116 val = rb_check_string_type(pat);
6117 if (NIL_P(val)) {
6118 Check_Type(pat, T_REGEXP);
6119 }
6120 pat = val;
6121 }
6122 if (check && is_broken_string(pat)) {
6123 rb_exc_raise(rb_reg_check_preprocess(pat));
6124 }
6125 return pat;
6126}
6127
6128static long
6129rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6130{
6131 if (BUILTIN_TYPE(pat) == T_STRING) {
6132 pos = rb_str_byteindex(str, pat, pos);
6133 if (set_backref_str) {
6134 if (pos >= 0) {
6135 str = rb_str_new_frozen_String(str);
6136 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6137 }
6138 else {
6140 }
6141 }
6142 return pos;
6143 }
6144 else {
6145 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6146 }
6147}
6148
6149
6150/*
6151 * call-seq:
6152 * sub!(pattern, replacement) -> self or nil
6153 * sub!(pattern) {|match| ... } -> self or nil
6154 *
6155 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6156 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6157 *
6158 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6159 *
6160 * Related: String#sub, String#gsub, String#gsub!.
6161 *
6162 */
6163
6164static VALUE
6165rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6166{
6167 VALUE pat, repl, hash = Qnil;
6168 int iter = 0;
6169 long plen;
6170 int min_arity = rb_block_given_p() ? 1 : 2;
6171 long beg;
6172
6173 rb_check_arity(argc, min_arity, 2);
6174 if (argc == 1) {
6175 iter = 1;
6176 }
6177 else {
6178 repl = argv[1];
6179 hash = rb_check_hash_type(argv[1]);
6180 if (NIL_P(hash)) {
6181 StringValue(repl);
6182 }
6183 }
6184
6185 pat = get_pat_quoted(argv[0], 1);
6186
6187 str_modifiable(str);
6188 beg = rb_pat_search(pat, str, 0, 1);
6189 if (beg >= 0) {
6190 rb_encoding *enc;
6191 int cr = ENC_CODERANGE(str);
6192 long beg0, end0;
6193 VALUE match, match0 = Qnil;
6194 struct re_registers *regs;
6195 char *p, *rp;
6196 long len, rlen;
6197
6198 match = rb_backref_get();
6199 regs = RMATCH_REGS(match);
6200 if (RB_TYPE_P(pat, T_STRING)) {
6201 beg0 = beg;
6202 end0 = beg0 + RSTRING_LEN(pat);
6203 match0 = pat;
6204 }
6205 else {
6206 beg0 = BEG(0);
6207 end0 = END(0);
6208 if (iter) match0 = rb_reg_nth_match(0, match);
6209 }
6210
6211 if (iter || !NIL_P(hash)) {
6212 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6213
6214 if (iter) {
6215 repl = rb_obj_as_string(rb_yield(match0));
6216 }
6217 else {
6218 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6219 repl = rb_obj_as_string(repl);
6220 }
6221 str_mod_check(str, p, len);
6222 rb_check_frozen(str);
6223 }
6224 else {
6225 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6226 }
6227
6228 enc = rb_enc_compatible(str, repl);
6229 if (!enc) {
6230 rb_encoding *str_enc = STR_ENC_GET(str);
6231 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6232 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6233 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6234 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6235 rb_enc_inspect_name(str_enc),
6236 rb_enc_inspect_name(STR_ENC_GET(repl)));
6237 }
6238 enc = STR_ENC_GET(repl);
6239 }
6240 rb_str_modify(str);
6241 rb_enc_associate(str, enc);
6243 int cr2 = ENC_CODERANGE(repl);
6244 if (cr2 == ENC_CODERANGE_BROKEN ||
6245 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6247 else
6248 cr = cr2;
6249 }
6250 plen = end0 - beg0;
6251 rlen = RSTRING_LEN(repl);
6252 len = RSTRING_LEN(str);
6253 if (rlen > plen) {
6254 RESIZE_CAPA(str, len + rlen - plen);
6255 }
6256 p = RSTRING_PTR(str);
6257 if (rlen != plen) {
6258 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6259 }
6260 rp = RSTRING_PTR(repl);
6261 memmove(p + beg0, rp, rlen);
6262 len += rlen - plen;
6263 STR_SET_LEN(str, len);
6264 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6265 ENC_CODERANGE_SET(str, cr);
6266
6267 RB_GC_GUARD(match);
6268
6269 return str;
6270 }
6271 return Qnil;
6272}
6273
6274
6275/*
6276 * call-seq:
6277 * sub(pattern, replacement) -> new_string
6278 * sub(pattern) {|match| ... } -> new_string
6279 *
6280 * Returns a copy of +self+ with only the first occurrence
6281 * (not all occurrences) of the given +pattern+ replaced.
6282 *
6283 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6284 *
6285 * Related: String#sub!, String#gsub, String#gsub!.
6286 *
6287 */
6288
6289static VALUE
6290rb_str_sub(int argc, VALUE *argv, VALUE str)
6291{
6292 str = str_duplicate(rb_cString, str);
6293 rb_str_sub_bang(argc, argv, str);
6294 return str;
6295}
6296
6297static VALUE
6298str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6299{
6300 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6301 long beg, beg0, end0;
6302 long offset, blen, slen, len, last;
6303 enum {STR, ITER, MAP} mode = STR;
6304 char *sp, *cp;
6305 int need_backref = -1;
6306 rb_encoding *str_enc;
6307
6308 switch (argc) {
6309 case 1:
6310 RETURN_ENUMERATOR(str, argc, argv);
6311 mode = ITER;
6312 break;
6313 case 2:
6314 repl = argv[1];
6315 hash = rb_check_hash_type(argv[1]);
6316 if (NIL_P(hash)) {
6317 StringValue(repl);
6318 }
6319 else {
6320 mode = MAP;
6321 }
6322 break;
6323 default:
6324 rb_error_arity(argc, 1, 2);
6325 }
6326
6327 pat = get_pat_quoted(argv[0], 1);
6328 beg = rb_pat_search(pat, str, 0, need_backref);
6329 if (beg < 0) {
6330 if (bang) return Qnil; /* no match, no substitution */
6331 return str_duplicate(rb_cString, str);
6332 }
6333
6334 offset = 0;
6335 blen = RSTRING_LEN(str) + 30; /* len + margin */
6336 dest = rb_str_buf_new(blen);
6337 sp = RSTRING_PTR(str);
6338 slen = RSTRING_LEN(str);
6339 cp = sp;
6340 str_enc = STR_ENC_GET(str);
6341 rb_enc_associate(dest, str_enc);
6342 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6343
6344 do {
6345 VALUE match = rb_backref_get();
6346 struct re_registers *regs = RMATCH_REGS(match);
6347 if (RB_TYPE_P(pat, T_STRING)) {
6348 beg0 = beg;
6349 end0 = beg0 + RSTRING_LEN(pat);
6350 match0 = pat;
6351 }
6352 else {
6353 beg0 = BEG(0);
6354 end0 = END(0);
6355 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6356 }
6357
6358 if (mode) {
6359 if (mode == ITER) {
6360 val = rb_obj_as_string(rb_yield(match0));
6361 }
6362 else {
6363 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6364 val = rb_obj_as_string(val);
6365 }
6366 str_mod_check(str, sp, slen);
6367 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6368 rb_raise(rb_eRuntimeError, "block should not cheat");
6369 }
6370 }
6371 else if (need_backref) {
6372 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6373 if (need_backref < 0) {
6374 need_backref = val != repl;
6375 }
6376 }
6377 else {
6378 val = repl;
6379 }
6380
6381 len = beg0 - offset; /* copy pre-match substr */
6382 if (len) {
6383 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6384 }
6385
6386 rb_str_buf_append(dest, val);
6387
6388 last = offset;
6389 offset = end0;
6390 if (beg0 == end0) {
6391 /*
6392 * Always consume at least one character of the input string
6393 * in order to prevent infinite loops.
6394 */
6395 if (RSTRING_LEN(str) <= end0) break;
6396 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6397 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6398 offset = end0 + len;
6399 }
6400 cp = RSTRING_PTR(str) + offset;
6401 if (offset > RSTRING_LEN(str)) break;
6402 beg = rb_pat_search(pat, str, offset, need_backref);
6403
6404 RB_GC_GUARD(match);
6405 } while (beg >= 0);
6406 if (RSTRING_LEN(str) > offset) {
6407 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6408 }
6409 rb_pat_search(pat, str, last, 1);
6410 if (bang) {
6411 str_shared_replace(str, dest);
6412 }
6413 else {
6414 str = dest;
6415 }
6416
6417 return str;
6418}
6419
6420
6421/*
6422 * call-seq:
6423 * gsub!(pattern, replacement) -> self or nil
6424 * gsub!(pattern) {|match| ... } -> self or nil
6425 * gsub!(pattern) -> an_enumerator
6426 *
6427 * Performs the specified substring replacement(s) on +self+;
6428 * returns +self+ if any replacement occurred, +nil+ otherwise.
6429 *
6430 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6431 *
6432 * Returns an Enumerator if no +replacement+ and no block given.
6433 *
6434 * Related: String#sub, String#gsub, String#sub!.
6435 *
6436 */
6437
6438static VALUE
6439rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6440{
6441 str_modify_keep_cr(str);
6442 return str_gsub(argc, argv, str, 1);
6443}
6444
6445
6446/*
6447 * call-seq:
6448 * gsub(pattern, replacement) -> new_string
6449 * gsub(pattern) {|match| ... } -> new_string
6450 * gsub(pattern) -> enumerator
6451 *
6452 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6453 *
6454 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6455 *
6456 * Returns an Enumerator if no +replacement+ and no block given.
6457 *
6458 * Related: String#sub, String#sub!, String#gsub!.
6459 *
6460 */
6461
6462static VALUE
6463rb_str_gsub(int argc, VALUE *argv, VALUE str)
6464{
6465 return str_gsub(argc, argv, str, 0);
6466}
6467
6468
6469/*
6470 * call-seq:
6471 * replace(other_string) -> self
6472 *
6473 * Replaces the contents of +self+ with the contents of +other_string+:
6474 *
6475 * s = 'foo' # => "foo"
6476 * s.replace('bar') # => "bar"
6477 *
6478 */
6479
6480VALUE
6482{
6483 str_modifiable(str);
6484 if (str == str2) return str;
6485
6486 StringValue(str2);
6487 str_discard(str);
6488 return str_replace(str, str2);
6489}
6490
6491/*
6492 * call-seq:
6493 * clear -> self
6494 *
6495 * Removes the contents of +self+:
6496 *
6497 * s = 'foo' # => "foo"
6498 * s.clear # => ""
6499 *
6500 */
6501
6502static VALUE
6503rb_str_clear(VALUE str)
6504{
6505 str_discard(str);
6506 STR_SET_EMBED(str);
6507 STR_SET_LEN(str, 0);
6508 RSTRING_PTR(str)[0] = 0;
6509 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6511 else
6513 return str;
6514}
6515
6516/*
6517 * call-seq:
6518 * chr -> string
6519 *
6520 * Returns a string containing the first character of +self+:
6521 *
6522 * s = 'foo' # => "foo"
6523 * s.chr # => "f"
6524 *
6525 */
6526
6527static VALUE
6528rb_str_chr(VALUE str)
6529{
6530 return rb_str_substr(str, 0, 1);
6531}
6532
6533/*
6534 * call-seq:
6535 * getbyte(index) -> integer or nil
6536 *
6537 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6538 *
6539 * s = 'abcde' # => "abcde"
6540 * s.getbyte(0) # => 97
6541 * s.getbyte(-1) # => 101
6542 * s.getbyte(5) # => nil
6543 *
6544 * Related: String#setbyte.
6545 */
6546VALUE
6547rb_str_getbyte(VALUE str, VALUE index)
6548{
6549 long pos = NUM2LONG(index);
6550
6551 if (pos < 0)
6552 pos += RSTRING_LEN(str);
6553 if (pos < 0 || RSTRING_LEN(str) <= pos)
6554 return Qnil;
6555
6556 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6557}
6558
6559/*
6560 * call-seq:
6561 * setbyte(index, integer) -> integer
6562 *
6563 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6564 *
6565 * s = 'abcde' # => "abcde"
6566 * s.setbyte(0, 98) # => 98
6567 * s # => "bbcde"
6568 *
6569 * Related: String#getbyte.
6570 */
6571VALUE
6572rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6573{
6574 long pos = NUM2LONG(index);
6575 long len = RSTRING_LEN(str);
6576 char *ptr, *head, *left = 0;
6577 rb_encoding *enc;
6578 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6579
6580 if (pos < -len || len <= pos)
6581 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6582 if (pos < 0)
6583 pos += len;
6584
6585 VALUE v = rb_to_int(value);
6586 VALUE w = rb_int_and(v, INT2FIX(0xff));
6587 char byte = (char)(NUM2INT(w) & 0xFF);
6588
6589 if (!str_independent(str))
6590 str_make_independent(str);
6591 enc = STR_ENC_GET(str);
6592 head = RSTRING_PTR(str);
6593 ptr = &head[pos];
6594 if (!STR_EMBED_P(str)) {
6595 cr = ENC_CODERANGE(str);
6596 switch (cr) {
6597 case ENC_CODERANGE_7BIT:
6598 left = ptr;
6599 *ptr = byte;
6600 if (ISASCII(byte)) goto end;
6601 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6602 if (!MBCLEN_CHARFOUND_P(nlen))
6604 else
6606 goto end;
6608 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6609 width = rb_enc_precise_mbclen(left, head+len, enc);
6610 *ptr = byte;
6611 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6612 if (!MBCLEN_CHARFOUND_P(nlen))
6614 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6616 goto end;
6617 }
6618 }
6620 *ptr = byte;
6621
6622 end:
6623 return value;
6624}
6625
6626static VALUE
6627str_byte_substr(VALUE str, long beg, long len, int empty)
6628{
6629 long n = RSTRING_LEN(str);
6630
6631 if (beg > n || len < 0) return Qnil;
6632 if (beg < 0) {
6633 beg += n;
6634 if (beg < 0) return Qnil;
6635 }
6636 if (len > n - beg)
6637 len = n - beg;
6638 if (len <= 0) {
6639 if (!empty) return Qnil;
6640 len = 0;
6641 }
6642
6643 VALUE str2 = str_subseq(str, beg, len);
6644
6645 str_enc_copy_direct(str2, str);
6646
6647 if (RSTRING_LEN(str2) == 0) {
6648 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6650 else
6652 }
6653 else {
6654 switch (ENC_CODERANGE(str)) {
6655 case ENC_CODERANGE_7BIT:
6657 break;
6658 default:
6660 break;
6661 }
6662 }
6663
6664 return str2;
6665}
6666
6667VALUE
6668rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6669{
6670 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6671}
6672
6673static VALUE
6674str_byte_aref(VALUE str, VALUE indx)
6675{
6676 long idx;
6677 if (FIXNUM_P(indx)) {
6678 idx = FIX2LONG(indx);
6679 }
6680 else {
6681 /* check if indx is Range */
6682 long beg, len = RSTRING_LEN(str);
6683
6684 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6685 case Qfalse:
6686 break;
6687 case Qnil:
6688 return Qnil;
6689 default:
6690 return str_byte_substr(str, beg, len, TRUE);
6691 }
6692
6693 idx = NUM2LONG(indx);
6694 }
6695 return str_byte_substr(str, idx, 1, FALSE);
6696}
6697
6698/*
6699 * call-seq:
6700 * byteslice(index, length = 1) -> string or nil
6701 * byteslice(range) -> string or nil
6702 *
6703 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6704 *
6705 * With integer arguments +index+ and +length+ given,
6706 * returns the substring beginning at the given +index+
6707 * of the given +length+ (if possible),
6708 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6709 *
6710 * s = '0123456789' # => "0123456789"
6711 * s.byteslice(2) # => "2"
6712 * s.byteslice(200) # => nil
6713 * s.byteslice(4, 3) # => "456"
6714 * s.byteslice(4, 30) # => "456789"
6715 * s.byteslice(4, -1) # => nil
6716 * s.byteslice(40, 2) # => nil
6717 *
6718 * In either case above, counts backwards from the end of +self+
6719 * if +index+ is negative:
6720 *
6721 * s = '0123456789' # => "0123456789"
6722 * s.byteslice(-4) # => "6"
6723 * s.byteslice(-4, 3) # => "678"
6724 *
6725 * With Range argument +range+ given, returns
6726 * <tt>byteslice(range.begin, range.size)</tt>:
6727 *
6728 * s = '0123456789' # => "0123456789"
6729 * s.byteslice(4..6) # => "456"
6730 * s.byteslice(-6..-4) # => "456"
6731 * s.byteslice(5..2) # => "" # range.size is zero.
6732 * s.byteslice(40..42) # => nil
6733 *
6734 * In all cases, a returned string has the same encoding as +self+:
6735 *
6736 * s.encoding # => #<Encoding:UTF-8>
6737 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6738 *
6739 */
6740
6741static VALUE
6742rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6743{
6744 if (argc == 2) {
6745 long beg = NUM2LONG(argv[0]);
6746 long len = NUM2LONG(argv[1]);
6747 return str_byte_substr(str, beg, len, TRUE);
6748 }
6749 rb_check_arity(argc, 1, 2);
6750 return str_byte_aref(str, argv[0]);
6751}
6752
6753static void
6754str_check_beg_len(VALUE str, long *beg, long *len)
6755{
6756 long end, slen = RSTRING_LEN(str);
6757
6758 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6759 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6760 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6761 }
6762 if (*beg < 0) {
6763 *beg += slen;
6764 }
6765 RUBY_ASSERT(*beg >= 0);
6766 RUBY_ASSERT(*beg <= slen);
6767
6768 if (*len > slen - *beg) {
6769 *len = slen - *beg;
6770 }
6771 end = *beg + *len;
6772 str_ensure_byte_pos(str, *beg);
6773 str_ensure_byte_pos(str, end);
6774}
6775
6776/*
6777 * call-seq:
6778 * bytesplice(index, length, str) -> string
6779 * bytesplice(index, length, str, str_index, str_length) -> string
6780 * bytesplice(range, str) -> string
6781 * bytesplice(range, str, str_range) -> string
6782 *
6783 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6784 * The portion of the string affected is determined using
6785 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6786 * If the replacement string is not the same length as the text it is replacing,
6787 * the string will be adjusted accordingly.
6788 *
6789 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6790 *
 *  The forms that take an Integer raise an IndexError if the value is out
 *  of range; the forms that take a Range raise a RangeError.
6793 * If the beginning or ending offset does not land on character (codepoint)
6794 * boundary, an IndexError will be raised.
6795 */
6796
6797static VALUE
6798rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6799{
6800 long beg, len, vbeg, vlen;
6801 VALUE val;
6802 int cr;
6803
6804 rb_check_arity(argc, 2, 5);
6805 if (!(argc == 2 || argc == 3 || argc == 5)) {
6806 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6807 }
6808 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6809 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6810 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6811 rb_builtin_class_name(argv[0]));
6812 }
6813 val = argv[1];
6814 StringValue(val);
6815 if (argc == 2) {
6816 /* bytesplice(range, str) */
6817 vbeg = 0;
6818 vlen = RSTRING_LEN(val);
6819 }
6820 else {
6821 /* bytesplice(range, str, str_range) */
6822 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6823 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6824 rb_builtin_class_name(argv[2]));
6825 }
6826 }
6827 }
6828 else {
6829 beg = NUM2LONG(argv[0]);
6830 len = NUM2LONG(argv[1]);
6831 val = argv[2];
6832 StringValue(val);
6833 if (argc == 3) {
6834 /* bytesplice(index, length, str) */
6835 vbeg = 0;
6836 vlen = RSTRING_LEN(val);
6837 }
6838 else {
6839 /* bytesplice(index, length, str, str_index, str_length) */
6840 vbeg = NUM2LONG(argv[3]);
6841 vlen = NUM2LONG(argv[4]);
6842 }
6843 }
6844 str_check_beg_len(str, &beg, &len);
6845 str_check_beg_len(val, &vbeg, &vlen);
6846 str_modify_keep_cr(str);
6847
6848 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6849 rb_enc_associate(str, rb_enc_check(str, val));
6850 }
6851
6852 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6854 if (cr != ENC_CODERANGE_BROKEN)
6855 ENC_CODERANGE_SET(str, cr);
6856 return str;
6857}
6858
6859/*
6860 * call-seq:
6861 * reverse -> string
6862 *
6863 * Returns a new string with the characters from +self+ in reverse order.
6864 *
6865 * 'stressed'.reverse # => "desserts"
6866 *
6867 */
6868
6869static VALUE
6870rb_str_reverse(VALUE str)
6871{
6872 rb_encoding *enc;
6873 VALUE rev;
6874 char *s, *e, *p;
6875 int cr;
6876
6877 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6878 enc = STR_ENC_GET(str);
6879 rev = rb_str_new(0, RSTRING_LEN(str));
6880 s = RSTRING_PTR(str); e = RSTRING_END(str);
6881 p = RSTRING_END(rev);
6882 cr = ENC_CODERANGE(str);
6883
6884 if (RSTRING_LEN(str) > 1) {
6885 if (single_byte_optimizable(str)) {
6886 while (s < e) {
6887 *--p = *s++;
6888 }
6889 }
6890 else if (cr == ENC_CODERANGE_VALID) {
6891 while (s < e) {
6892 int clen = rb_enc_fast_mbclen(s, e, enc);
6893
6894 p -= clen;
6895 memcpy(p, s, clen);
6896 s += clen;
6897 }
6898 }
6899 else {
6900 cr = rb_enc_asciicompat(enc) ?
6902 while (s < e) {
6903 int clen = rb_enc_mbclen(s, e, enc);
6904
6905 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6906 p -= clen;
6907 memcpy(p, s, clen);
6908 s += clen;
6909 }
6910 }
6911 }
6912 STR_SET_LEN(rev, RSTRING_LEN(str));
6913 str_enc_copy_direct(rev, str);
6914 ENC_CODERANGE_SET(rev, cr);
6915
6916 return rev;
6917}
6918
6919
6920/*
6921 * call-seq:
6922 * reverse! -> self
6923 *
6924 * Returns +self+ with its characters reversed:
6925 *
6926 * s = 'stressed'
6927 * s.reverse! # => "desserts"
6928 * s # => "desserts"
6929 *
6930 */
6931
6932static VALUE
6933rb_str_reverse_bang(VALUE str)
6934{
6935 if (RSTRING_LEN(str) > 1) {
6936 if (single_byte_optimizable(str)) {
6937 char *s, *e, c;
6938
6939 str_modify_keep_cr(str);
6940 s = RSTRING_PTR(str);
6941 e = RSTRING_END(str) - 1;
6942 while (s < e) {
6943 c = *s;
6944 *s++ = *e;
6945 *e-- = c;
6946 }
6947 }
6948 else {
6949 str_shared_replace(str, rb_str_reverse(str));
6950 }
6951 }
6952 else {
6953 str_modify_keep_cr(str);
6954 }
6955 return str;
6956}
6957
6958
6959/*
6960 * call-seq:
6961 * include?(other_string) -> true or false
6962 *
6963 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6964 *
6965 * s = 'foo'
6966 * s.include?('f') # => true
6967 * s.include?('fo') # => true
6968 * s.include?('food') # => false
6969 *
6970 */
6971
6972VALUE
6973rb_str_include(VALUE str, VALUE arg)
6974{
6975 long i;
6976
6977 StringValue(arg);
6978 i = rb_str_index(str, arg, 0);
6979
6980 return RBOOL(i != -1);
6981}
6982
6983
6984/*
6985 * call-seq:
6986 * to_i(base = 10) -> integer
6987 *
6988 * Returns the result of interpreting leading characters in +self+
6989 * as an integer in the given +base+ (which must be in (0, 2..36)):
6990 *
6991 * '123456'.to_i # => 123456
6992 * '123def'.to_i(16) # => 1195503
6993 *
6994 * With +base+ zero, string +object+ may contain leading characters
6995 * to specify the actual base:
6996 *
6997 * '123def'.to_i(0) # => 123
6998 * '0123def'.to_i(0) # => 83
6999 * '0b123def'.to_i(0) # => 1
7000 * '0o123def'.to_i(0) # => 83
7001 * '0d123def'.to_i(0) # => 123
7002 * '0x123def'.to_i(0) # => 1195503
7003 *
7004 * Characters past a leading valid number (in the given +base+) are ignored:
7005 *
7006 * '12.345'.to_i # => 12
7007 * '12345'.to_i(2) # => 1
7008 *
7009 * Returns zero if there is no leading valid number:
7010 *
7011 * 'abcdef'.to_i # => 0
7012 * '2'.to_i(2) # => 0
7013 *
7014 */
7015
7016static VALUE
7017rb_str_to_i(int argc, VALUE *argv, VALUE str)
7018{
7019 int base = 10;
7020
7021 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7022 rb_raise(rb_eArgError, "invalid radix %d", base);
7023 }
7024 return rb_str_to_inum(str, base, FALSE);
7025}
7026
7027
7028/*
7029 * call-seq:
7030 * to_f -> float
7031 *
7032 * Returns the result of interpreting leading characters in +self+ as a Float:
7033 *
7034 * '3.14159'.to_f # => 3.14159
7035 * '1.234e-2'.to_f # => 0.01234
7036 *
 *  Characters past a leading valid number are ignored:
7038 *
7039 * '3.14 (pi to two places)'.to_f # => 3.14
7040 *
7041 * Returns zero if there is no leading valid number:
7042 *
7043 * 'abcdef'.to_f # => 0.0
7044 *
7045 */
7046
7047static VALUE
7048rb_str_to_f(VALUE str)
7049{
7050 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7051}
7052
7053
7054/*
7055 * call-seq:
7056 * to_s -> self or string
7057 *
7058 * Returns +self+ if +self+ is a +String+,
7059 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7060 */
7061
7062static VALUE
7063rb_str_to_s(VALUE str)
7064{
7065 if (rb_obj_class(str) != rb_cString) {
7066 return str_duplicate(rb_cString, str);
7067 }
7068 return str;
7069}
7070
#if 0
/*
 * Compiled out (#if 0).  Encodes code point +c+ in +enc+ into a scratch
 * buffer and appends those bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7082
7083#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7084
7085int
7086rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7087{
7088 char buf[CHAR_ESC_LEN + 1];
7089 int l;
7090
7091#if SIZEOF_INT > 4
7092 c &= 0xffffffff;
7093#endif
7094 if (unicode_p) {
7095 if (c < 0x7F && ISPRINT(c)) {
7096 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7097 }
7098 else if (c < 0x10000) {
7099 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7100 }
7101 else {
7102 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7103 }
7104 }
7105 else {
7106 if (c < 0x100) {
7107 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7108 }
7109 else {
7110 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7111 }
7112 }
7113 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7114 rb_str_buf_cat(result, buf, l);
7115 return l;
7116}
7117
/*
 * Returns the backslash escape for control character +c+ as a static
 * string ("\\n" for newline, "\\c?" for DEL, ...), or NULL when +c+ has no
 * such escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    const int count = (int)(sizeof(escapes) / sizeof(escapes[0]));
    int i;

    for (i = 0; i < count; i++) {
        if (escapes[i].code == c) {
            return escapes[i].text;
        }
    }
    return NULL;
}
7135
7136VALUE
7137rb_str_escape(VALUE str)
7138{
7139 int encidx = ENCODING_GET(str);
7140 rb_encoding *enc = rb_enc_from_index(encidx);
7141 const char *p = RSTRING_PTR(str);
7142 const char *pend = RSTRING_END(str);
7143 const char *prev = p;
7144 char buf[CHAR_ESC_LEN + 1];
7145 VALUE result = rb_str_buf_new(0);
7146 int unicode_p = rb_enc_unicode_p(enc);
7147 int asciicompat = rb_enc_asciicompat(enc);
7148
7149 while (p < pend) {
7150 unsigned int c;
7151 const char *cc;
7152 int n = rb_enc_precise_mbclen(p, pend, enc);
7153 if (!MBCLEN_CHARFOUND_P(n)) {
7154 if (p > prev) str_buf_cat(result, prev, p - prev);
7155 n = rb_enc_mbminlen(enc);
7156 if (pend < p + n)
7157 n = (int)(pend - p);
7158 while (n--) {
7159 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7160 str_buf_cat(result, buf, strlen(buf));
7161 prev = ++p;
7162 }
7163 continue;
7164 }
7165 n = MBCLEN_CHARFOUND_LEN(n);
7166 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7167 p += n;
7168 cc = ruby_escaped_char(c);
7169 if (cc) {
7170 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7171 str_buf_cat(result, cc, strlen(cc));
7172 prev = p;
7173 }
7174 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7175 }
7176 else {
7177 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7178 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7179 prev = p;
7180 }
7181 }
7182 if (p > prev) str_buf_cat(result, prev, p - prev);
7184
7185 return result;
7186}
7187
7188/*
7189 * call-seq:
7190 * inspect -> string
7191 *
7192 * Returns a printable version of +self+, enclosed in double-quotes,
7193 * and with special characters escaped:
7194 *
7195 * s = "foo\tbar\tbaz\n"
7196 * s.inspect
7197 * # => "\"foo\\tbar\\tbaz\\n\""
7198 *
7199 */
7200
7201VALUE
7203{
7204 int encidx = ENCODING_GET(str);
7205 rb_encoding *enc = rb_enc_from_index(encidx);
7206 const char *p, *pend, *prev;
7207 char buf[CHAR_ESC_LEN + 1];
7208 VALUE result = rb_str_buf_new(0);
7209 rb_encoding *resenc = rb_default_internal_encoding();
7210 int unicode_p = rb_enc_unicode_p(enc);
7211 int asciicompat = rb_enc_asciicompat(enc);
7212
7213 if (resenc == NULL) resenc = rb_default_external_encoding();
7214 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7215 rb_enc_associate(result, resenc);
7216 str_buf_cat2(result, "\"");
7217
7218 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7219 prev = p;
7220 while (p < pend) {
7221 unsigned int c, cc;
7222 int n;
7223
7224 n = rb_enc_precise_mbclen(p, pend, enc);
7225 if (!MBCLEN_CHARFOUND_P(n)) {
7226 if (p > prev) str_buf_cat(result, prev, p - prev);
7227 n = rb_enc_mbminlen(enc);
7228 if (pend < p + n)
7229 n = (int)(pend - p);
7230 while (n--) {
7231 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7232 str_buf_cat(result, buf, strlen(buf));
7233 prev = ++p;
7234 }
7235 continue;
7236 }
7237 n = MBCLEN_CHARFOUND_LEN(n);
7238 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7239 p += n;
7240 if ((asciicompat || unicode_p) &&
7241 (c == '"'|| c == '\\' ||
7242 (c == '#' &&
7243 p < pend &&
7244 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7245 (cc = rb_enc_codepoint(p,pend,enc),
7246 (cc == '$' || cc == '@' || cc == '{'))))) {
7247 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7248 str_buf_cat2(result, "\\");
7249 if (asciicompat || enc == resenc) {
7250 prev = p - n;
7251 continue;
7252 }
7253 }
7254 switch (c) {
7255 case '\n': cc = 'n'; break;
7256 case '\r': cc = 'r'; break;
7257 case '\t': cc = 't'; break;
7258 case '\f': cc = 'f'; break;
7259 case '\013': cc = 'v'; break;
7260 case '\010': cc = 'b'; break;
7261 case '\007': cc = 'a'; break;
7262 case 033: cc = 'e'; break;
7263 default: cc = 0; break;
7264 }
7265 if (cc) {
7266 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7267 buf[0] = '\\';
7268 buf[1] = (char)cc;
7269 str_buf_cat(result, buf, 2);
7270 prev = p;
7271 continue;
7272 }
7273 /* The special casing of 0x85 (NEXT_LINE) here is because
7274 * Oniguruma historically treats it as printable, but it
7275 * doesn't match the print POSIX bracket class or character
7276 * property in regexps.
7277 *
7278 * See Ruby Bug #16842 for details:
7279 * https://bugs.ruby-lang.org/issues/16842
7280 */
7281 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7282 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7283 continue;
7284 }
7285 else {
7286 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7287 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7288 prev = p;
7289 continue;
7290 }
7291 }
7292 if (p > prev) str_buf_cat(result, prev, p - prev);
7293 str_buf_cat2(result, "\"");
7294
7295 return result;
7296}
7297
/* True when +p+ lies before +e+ and points at '$', '@' or '{'.
 * Note: +p+ is evaluated more than once. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7299
7300/*
7301 * call-seq:
7302 * dump -> string
7303 *
7304 * Returns a printable version of +self+, enclosed in double-quotes,
7305 * with special characters escaped, and with non-printing characters
7306 * replaced by hexadecimal notation:
7307 *
7308 * "hello \n ''".dump # => "\"hello \\n ''\""
7309 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7310 *
7311 * Related: String#undump (inverse of String#dump).
7312 *
7313 */
7314
7315VALUE
7317{
7318 int encidx = rb_enc_get_index(str);
7319 rb_encoding *enc = rb_enc_from_index(encidx);
7320 long len;
7321 const char *p, *pend;
7322 char *q, *qend;
7323 VALUE result;
7324 int u8 = (encidx == rb_utf8_encindex());
7325 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7326
7327 len = 2; /* "" */
7328 if (!rb_enc_asciicompat(enc)) {
7329 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7330 len += strlen(enc->name);
7331 }
7332
7333 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7334 while (p < pend) {
7335 int clen;
7336 unsigned char c = *p++;
7337
7338 switch (c) {
7339 case '"': case '\\':
7340 case '\n': case '\r':
7341 case '\t': case '\f':
7342 case '\013': case '\010': case '\007': case '\033':
7343 clen = 2;
7344 break;
7345
7346 case '#':
7347 clen = IS_EVSTR(p, pend) ? 2 : 1;
7348 break;
7349
7350 default:
7351 if (ISPRINT(c)) {
7352 clen = 1;
7353 }
7354 else {
7355 if (u8 && c > 0x7F) { /* \u notation */
7356 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7357 if (MBCLEN_CHARFOUND_P(n)) {
7358 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7359 if (cc <= 0xFFFF)
7360 clen = 6; /* \uXXXX */
7361 else if (cc <= 0xFFFFF)
7362 clen = 9; /* \u{XXXXX} */
7363 else
7364 clen = 10; /* \u{XXXXXX} */
7365 p += MBCLEN_CHARFOUND_LEN(n)-1;
7366 break;
7367 }
7368 }
7369 clen = 4; /* \xNN */
7370 }
7371 break;
7372 }
7373
7374 if (clen > LONG_MAX - len) {
7375 rb_raise(rb_eRuntimeError, "string size too big");
7376 }
7377 len += clen;
7378 }
7379
7380 result = rb_str_new(0, len);
7381 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7382 q = RSTRING_PTR(result); qend = q + len + 1;
7383
7384 *q++ = '"';
7385 while (p < pend) {
7386 unsigned char c = *p++;
7387
7388 if (c == '"' || c == '\\') {
7389 *q++ = '\\';
7390 *q++ = c;
7391 }
7392 else if (c == '#') {
7393 if (IS_EVSTR(p, pend)) *q++ = '\\';
7394 *q++ = '#';
7395 }
7396 else if (c == '\n') {
7397 *q++ = '\\';
7398 *q++ = 'n';
7399 }
7400 else if (c == '\r') {
7401 *q++ = '\\';
7402 *q++ = 'r';
7403 }
7404 else if (c == '\t') {
7405 *q++ = '\\';
7406 *q++ = 't';
7407 }
7408 else if (c == '\f') {
7409 *q++ = '\\';
7410 *q++ = 'f';
7411 }
7412 else if (c == '\013') {
7413 *q++ = '\\';
7414 *q++ = 'v';
7415 }
7416 else if (c == '\010') {
7417 *q++ = '\\';
7418 *q++ = 'b';
7419 }
7420 else if (c == '\007') {
7421 *q++ = '\\';
7422 *q++ = 'a';
7423 }
7424 else if (c == '\033') {
7425 *q++ = '\\';
7426 *q++ = 'e';
7427 }
7428 else if (ISPRINT(c)) {
7429 *q++ = c;
7430 }
7431 else {
7432 *q++ = '\\';
7433 if (u8) {
7434 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7435 if (MBCLEN_CHARFOUND_P(n)) {
7436 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7437 p += n;
7438 if (cc <= 0xFFFF)
7439 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7440 else
7441 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7442 q += strlen(q);
7443 continue;
7444 }
7445 }
7446 snprintf(q, qend-q, "x%02X", c);
7447 q += 3;
7448 }
7449 }
7450 *q++ = '"';
7451 *q = '\0';
7452 if (!rb_enc_asciicompat(enc)) {
7453 snprintf(q, qend-q, nonascii_suffix, enc->name);
7454 encidx = rb_ascii8bit_encindex();
7455 }
7456 /* result from dump is ASCII */
7457 rb_enc_associate_index(result, encidx);
7459 return result;
7460}
7461
/*
 * Maps the letter of a single-character backslash escape (the 'n' of "\n")
 * to the character it denotes.
 *
 * Returns the decoded character for n, r, t, f, v, b, a and e, and -1 for
 * any other input, so that control never falls off the end of the function.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not one of the recognised escape letters */
        return -1;
    }
}
7485
7486static void
7487undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7488{
7489 const char *s = *ss;
7490 unsigned int c;
7491 int codelen;
7492 size_t hexlen;
7493 unsigned char buf[6];
7494 static rb_encoding *enc_utf8 = NULL;
7495
7496 switch (*s) {
7497 case '\\':
7498 case '"':
7499 case '#':
7500 rb_str_cat(undumped, s, 1); /* cat itself */
7501 s++;
7502 break;
7503 case 'n':
7504 case 'r':
7505 case 't':
7506 case 'f':
7507 case 'v':
7508 case 'b':
7509 case 'a':
7510 case 'e':
7511 *buf = unescape_ascii(*s);
7512 rb_str_cat(undumped, (char *)buf, 1);
7513 s++;
7514 break;
7515 case 'u':
7516 if (*binary) {
7517 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7518 }
7519 *utf8 = true;
7520 if (++s >= s_end) {
7521 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7522 }
7523 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7524 if (*penc != enc_utf8) {
7525 *penc = enc_utf8;
7526 rb_enc_associate(undumped, enc_utf8);
7527 }
7528 if (*s == '{') { /* handle \u{...} form */
7529 s++;
7530 for (;;) {
7531 if (s >= s_end) {
7532 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7533 }
7534 if (*s == '}') {
7535 s++;
7536 break;
7537 }
7538 if (ISSPACE(*s)) {
7539 s++;
7540 continue;
7541 }
7542 c = scan_hex(s, s_end-s, &hexlen);
7543 if (hexlen == 0 || hexlen > 6) {
7544 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7545 }
7546 if (c > 0x10ffff) {
7547 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7548 }
7549 if (0xd800 <= c && c <= 0xdfff) {
7550 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7551 }
7552 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7553 rb_str_cat(undumped, (char *)buf, codelen);
7554 s += hexlen;
7555 }
7556 }
7557 else { /* handle \uXXXX form */
7558 c = scan_hex(s, 4, &hexlen);
7559 if (hexlen != 4) {
7560 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7561 }
7562 if (0xd800 <= c && c <= 0xdfff) {
7563 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7564 }
7565 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7566 rb_str_cat(undumped, (char *)buf, codelen);
7567 s += hexlen;
7568 }
7569 break;
7570 case 'x':
7571 if (*utf8) {
7572 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7573 }
7574 *binary = true;
7575 if (++s >= s_end) {
7576 rb_raise(rb_eRuntimeError, "invalid hex escape");
7577 }
7578 *buf = scan_hex(s, 2, &hexlen);
7579 if (hexlen != 2) {
7580 rb_raise(rb_eRuntimeError, "invalid hex escape");
7581 }
7582 rb_str_cat(undumped, (char *)buf, 1);
7583 s += hexlen;
7584 break;
7585 default:
7586 rb_str_cat(undumped, s-1, 2);
7587 s++;
7588 }
7589
7590 *ss = s;
7591}
7592
7593static VALUE rb_str_is_ascii_only_p(VALUE str);
7594
7595/*
7596 * call-seq:
7597 * undump -> string
7598 *
7599 * Returns an unescaped version of +self+:
7600 *
7601 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7602 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7603 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7604 * s_undumped == s_orig # => true
7605 *
7606 * Related: String#dump (inverse of String#undump).
7607 *
7608 */
7609
7610static VALUE
7611str_undump(VALUE str)
7612{
7613 const char *s = RSTRING_PTR(str);
7614 const char *s_end = RSTRING_END(str);
7615 rb_encoding *enc = rb_enc_get(str);
7616 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7617 bool utf8 = false;
7618 bool binary = false;
7619 int w;
7620
7622 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7623 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7624 }
7625 if (!str_null_check(str, &w)) {
7626 rb_raise(rb_eRuntimeError, "string contains null byte");
7627 }
7628 if (RSTRING_LEN(str) < 2) goto invalid_format;
7629 if (*s != '"') goto invalid_format;
7630
7631 /* strip '"' at the start */
7632 s++;
7633
7634 for (;;) {
7635 if (s >= s_end) {
7636 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7637 }
7638
7639 if (*s == '"') {
7640 /* epilogue */
7641 s++;
7642 if (s == s_end) {
7643 /* ascii compatible dumped string */
7644 break;
7645 }
7646 else {
7647 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7648 static const char dup_suffix[] = ".dup";
7649 const char *encname;
7650 int encidx;
7651 ptrdiff_t size;
7652
7653 /* check separately for strings dumped by older versions */
7654 size = sizeof(dup_suffix) - 1;
7655 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7656
7657 size = sizeof(force_encoding_suffix) - 1;
7658 if (s_end - s <= size) goto invalid_format;
7659 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7660 s += size;
7661
7662 if (utf8) {
7663 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7664 }
7665
7666 encname = s;
7667 s = memchr(s, '"', s_end-s);
7668 size = s - encname;
7669 if (!s) goto invalid_format;
7670 if (s_end - s != 2) goto invalid_format;
7671 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7672
7673 encidx = rb_enc_find_index2(encname, (long)size);
7674 if (encidx < 0) {
7675 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7676 }
7677 rb_enc_associate_index(undumped, encidx);
7678 }
7679 break;
7680 }
7681
7682 if (*s == '\\') {
7683 s++;
7684 if (s >= s_end) {
7685 rb_raise(rb_eRuntimeError, "invalid escape");
7686 }
7687 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7688 }
7689 else {
7690 rb_str_cat(undumped, s++, 1);
7691 }
7692 }
7693
7694 RB_GC_GUARD(str);
7695
7696 return undumped;
7697invalid_format:
7698 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7699}
7700
7701static void
7702rb_str_check_dummy_enc(rb_encoding *enc)
7703{
7704 if (rb_enc_dummy_p(enc)) {
7705 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7706 rb_enc_name(enc));
7707 }
7708}
7709
7710static rb_encoding *
7711str_true_enc(VALUE str)
7712{
7713 rb_encoding *enc = STR_ENC_GET(str);
7714 rb_str_check_dummy_enc(enc);
7715 return enc;
7716}
7717
7718static OnigCaseFoldType
7719check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7720{
7721 if (argc==0)
7722 return flags;
7723 if (argc>2)
7724 rb_raise(rb_eArgError, "too many options");
7725 if (argv[0]==sym_turkic) {
7726 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7727 if (argc==2) {
7728 if (argv[1]==sym_lithuanian)
7729 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7730 else
7731 rb_raise(rb_eArgError, "invalid second option");
7732 }
7733 }
7734 else if (argv[0]==sym_lithuanian) {
7735 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7736 if (argc==2) {
7737 if (argv[1]==sym_turkic)
7738 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7739 else
7740 rb_raise(rb_eArgError, "invalid second option");
7741 }
7742 }
7743 else if (argc>1)
7744 rb_raise(rb_eArgError, "too many options");
7745 else if (argv[0]==sym_ascii)
7746 flags |= ONIGENC_CASE_ASCII_ONLY;
7747 else if (argv[0]==sym_fold) {
7748 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7749 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7750 else
7751 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7752 }
7753 else
7754 rb_raise(rb_eArgError, "invalid option");
7755 return flags;
7756}
7757
7758static inline bool
7759case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7760{
7761 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7762 return true;
7763 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7764}
7765
/* Extra bytes added to each mapping buffer; this should be enough to absorb any single-character length increase */
7767#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7768#ifndef CASEMAP_DEBUG
7769# define CASEMAP_DEBUG 0
7770#endif
7771
7772struct mapping_buffer;
7773typedef struct mapping_buffer {
7774 size_t capa;
7775 size_t used;
7776 struct mapping_buffer *next;
7777 OnigUChar space[FLEX_ARY_LEN];
7779
7780static void
7781mapping_buffer_free(void *p)
7782{
7783 mapping_buffer *previous_buffer;
7784 mapping_buffer *current_buffer = p;
7785 while (current_buffer) {
7786 previous_buffer = current_buffer;
7787 current_buffer = current_buffer->next;
7788 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7789 }
7790}
7791
7792static const rb_data_type_t mapping_buffer_type = {
7793 "mapping_buffer",
7794 {0, mapping_buffer_free,},
7795 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7796};
7797
7798static VALUE
7799rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7800{
7801 VALUE target;
7802
7803 const OnigUChar *source_current, *source_end;
7804 int target_length = 0;
7805 VALUE buffer_anchor;
7806 mapping_buffer *current_buffer = 0;
7807 mapping_buffer **pre_buffer;
7808 size_t buffer_count = 0;
7809 int buffer_length_or_invalid;
7810
7811 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7812
7813 source_current = (OnigUChar*)RSTRING_PTR(source);
7814 source_end = (OnigUChar*)RSTRING_END(source);
7815
7816 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7817 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7818 while (source_current < source_end) {
7819 /* increase multiplier using buffer count to converge quickly */
7820 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7821 if (CASEMAP_DEBUG) {
7822 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7823 }
7824 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7825 *pre_buffer = current_buffer;
7826 pre_buffer = &current_buffer->next;
7827 current_buffer->next = NULL;
7828 current_buffer->capa = capa;
7829 buffer_length_or_invalid = enc->case_map(flags,
7830 &source_current, source_end,
7831 current_buffer->space,
7832 current_buffer->space+current_buffer->capa,
7833 enc);
7834 if (buffer_length_or_invalid < 0) {
7835 current_buffer = DATA_PTR(buffer_anchor);
7836 DATA_PTR(buffer_anchor) = 0;
7837 mapping_buffer_free(current_buffer);
7838 rb_raise(rb_eArgError, "input string invalid");
7839 }
7840 target_length += current_buffer->used = buffer_length_or_invalid;
7841 }
7842 if (CASEMAP_DEBUG) {
7843 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7844 }
7845
7846 if (buffer_count==1) {
7847 target = rb_str_new((const char*)current_buffer->space, target_length);
7848 }
7849 else {
7850 char *target_current;
7851
7852 target = rb_str_new(0, target_length);
7853 target_current = RSTRING_PTR(target);
7854 current_buffer = DATA_PTR(buffer_anchor);
7855 while (current_buffer) {
7856 memcpy(target_current, current_buffer->space, current_buffer->used);
7857 target_current += current_buffer->used;
7858 current_buffer = current_buffer->next;
7859 }
7860 }
7861 current_buffer = DATA_PTR(buffer_anchor);
7862 DATA_PTR(buffer_anchor) = 0;
7863 mapping_buffer_free(current_buffer);
7864
7865 RB_GC_GUARD(buffer_anchor);
7866
7867 /* TODO: check about string terminator character */
7868 str_enc_copy_direct(target, source);
7869 /*ENC_CODERANGE_SET(mapped, cr);*/
7870
7871 return target;
7872}
7873
7874static VALUE
7875rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7876{
7877 const OnigUChar *source_current, *source_end;
7878 OnigUChar *target_current, *target_end;
7879 long old_length = RSTRING_LEN(source);
7880 int length_or_invalid;
7881
7882 if (old_length == 0) return Qnil;
7883
7884 source_current = (OnigUChar*)RSTRING_PTR(source);
7885 source_end = (OnigUChar*)RSTRING_END(source);
7886 if (source == target) {
7887 target_current = (OnigUChar*)source_current;
7888 target_end = (OnigUChar*)source_end;
7889 }
7890 else {
7891 target_current = (OnigUChar*)RSTRING_PTR(target);
7892 target_end = (OnigUChar*)RSTRING_END(target);
7893 }
7894
7895 length_or_invalid = onigenc_ascii_only_case_map(flags,
7896 &source_current, source_end,
7897 target_current, target_end, enc);
7898 if (length_or_invalid < 0)
7899 rb_raise(rb_eArgError, "input string invalid");
7900 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7901 fprintf(stderr, "problem with rb_str_ascii_casemap"
7902 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7903 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7904 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7905 }
7906
7907 str_enc_copy(target, source);
7908
7909 return target;
7910}
7911
7912static bool
7913upcase_single(VALUE str)
7914{
7915 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7916 bool modified = false;
7917
7918 while (s < send) {
7919 unsigned int c = *(unsigned char*)s;
7920
7921 if ('a' <= c && c <= 'z') {
7922 *s = 'A' + (c - 'a');
7923 modified = true;
7924 }
7925 s++;
7926 }
7927 return modified;
7928}
7929
7930/*
7931 * call-seq:
7932 * upcase!(*options) -> self or nil
7933 *
7934 * Upcases the characters in +self+;
7935 * returns +self+ if any changes were made, +nil+ otherwise:
7936 *
7937 * s = 'Hello World!' # => "Hello World!"
7938 * s.upcase! # => "HELLO WORLD!"
7939 * s # => "HELLO WORLD!"
7940 * s.upcase! # => nil
7941 *
7942 * The casing may be affected by the given +options+;
7943 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7944 *
7945 * Related: String#upcase, String#downcase, String#downcase!.
7946 *
7947 */
7948
7949static VALUE
7950rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7951{
7952 rb_encoding *enc;
7953 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7954
7955 flags = check_case_options(argc, argv, flags);
7956 str_modify_keep_cr(str);
7957 enc = str_true_enc(str);
7958 if (case_option_single_p(flags, enc, str)) {
7959 if (upcase_single(str))
7960 flags |= ONIGENC_CASE_MODIFIED;
7961 }
7962 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7963 rb_str_ascii_casemap(str, str, &flags, enc);
7964 else
7965 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7966
7967 if (ONIGENC_CASE_MODIFIED&flags) return str;
7968 return Qnil;
7969}
7970
7971
7972/*
7973 * call-seq:
7974 * upcase(*options) -> string
7975 *
7976 * Returns a string containing the upcased characters in +self+:
7977 *
7978 * s = 'Hello World!' # => "Hello World!"
7979 * s.upcase # => "HELLO WORLD!"
7980 *
7981 * The casing may be affected by the given +options+;
7982 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7983 *
7984 * Related: String#upcase!, String#downcase, String#downcase!.
7985 *
7986 */
7987
7988static VALUE
7989rb_str_upcase(int argc, VALUE *argv, VALUE str)
7990{
7991 rb_encoding *enc;
7992 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7993 VALUE ret;
7994
7995 flags = check_case_options(argc, argv, flags);
7996 enc = str_true_enc(str);
7997 if (case_option_single_p(flags, enc, str)) {
7998 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7999 str_enc_copy_direct(ret, str);
8000 upcase_single(ret);
8001 }
8002 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8003 ret = rb_str_new(0, RSTRING_LEN(str));
8004 rb_str_ascii_casemap(str, ret, &flags, enc);
8005 }
8006 else {
8007 ret = rb_str_casemap(str, &flags, enc);
8008 }
8009
8010 return ret;
8011}
8012
8013static bool
8014downcase_single(VALUE str)
8015{
8016 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8017 bool modified = false;
8018
8019 while (s < send) {
8020 unsigned int c = *(unsigned char*)s;
8021
8022 if ('A' <= c && c <= 'Z') {
8023 *s = 'a' + (c - 'A');
8024 modified = true;
8025 }
8026 s++;
8027 }
8028
8029 return modified;
8030}
8031
8032/*
8033 * call-seq:
8034 * downcase!(*options) -> self or nil
8035 *
8036 * Downcases the characters in +self+;
8037 * returns +self+ if any changes were made, +nil+ otherwise:
8038 *
8039 * s = 'Hello World!' # => "Hello World!"
8040 * s.downcase! # => "hello world!"
8041 * s # => "hello world!"
8042 * s.downcase! # => nil
8043 *
8044 * The casing may be affected by the given +options+;
8045 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8046 *
8047 * Related: String#downcase, String#upcase, String#upcase!.
8048 *
8049 */
8050
8051static VALUE
8052rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8053{
8054 rb_encoding *enc;
8055 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8056
8057 flags = check_case_options(argc, argv, flags);
8058 str_modify_keep_cr(str);
8059 enc = str_true_enc(str);
8060 if (case_option_single_p(flags, enc, str)) {
8061 if (downcase_single(str))
8062 flags |= ONIGENC_CASE_MODIFIED;
8063 }
8064 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8065 rb_str_ascii_casemap(str, str, &flags, enc);
8066 else
8067 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8068
8069 if (ONIGENC_CASE_MODIFIED&flags) return str;
8070 return Qnil;
8071}
8072
8073
8074/*
8075 * call-seq:
8076 * downcase(*options) -> string
8077 *
8078 * Returns a string containing the downcased characters in +self+:
8079 *
8080 * s = 'Hello World!' # => "Hello World!"
8081 * s.downcase # => "hello world!"
8082 *
8083 * The casing may be affected by the given +options+;
8084 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8085 *
8086 * Related: String#downcase!, String#upcase, String#upcase!.
8087 *
8088 */
8089
8090static VALUE
8091rb_str_downcase(int argc, VALUE *argv, VALUE str)
8092{
8093 rb_encoding *enc;
8094 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8095 VALUE ret;
8096
8097 flags = check_case_options(argc, argv, flags);
8098 enc = str_true_enc(str);
8099 if (case_option_single_p(flags, enc, str)) {
8100 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8101 str_enc_copy_direct(ret, str);
8102 downcase_single(ret);
8103 }
8104 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8105 ret = rb_str_new(0, RSTRING_LEN(str));
8106 rb_str_ascii_casemap(str, ret, &flags, enc);
8107 }
8108 else {
8109 ret = rb_str_casemap(str, &flags, enc);
8110 }
8111
8112 return ret;
8113}
8114
8115
8116/*
8117 * call-seq:
8118 * capitalize!(*options) -> self or nil
8119 *
8120 * Upcases the first character in +self+;
8121 * downcases the remaining characters;
8122 * returns +self+ if any changes were made, +nil+ otherwise:
8123 *
8124 * s = 'hello World!' # => "hello World!"
8125 * s.capitalize! # => "Hello world!"
8126 * s # => "Hello world!"
8127 * s.capitalize! # => nil
8128 *
8129 * The casing may be affected by the given +options+;
8130 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8131 *
8132 * Related: String#capitalize.
8133 *
8134 */
8135
8136static VALUE
8137rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8138{
8139 rb_encoding *enc;
8140 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8141
8142 flags = check_case_options(argc, argv, flags);
8143 str_modify_keep_cr(str);
8144 enc = str_true_enc(str);
8145 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8146 if (flags&ONIGENC_CASE_ASCII_ONLY)
8147 rb_str_ascii_casemap(str, str, &flags, enc);
8148 else
8149 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8150
8151 if (ONIGENC_CASE_MODIFIED&flags) return str;
8152 return Qnil;
8153}
8154
8155
8156/*
8157 * call-seq:
8158 * capitalize(*options) -> string
8159 *
8160 * Returns a string containing the characters in +self+;
8161 * the first character is upcased;
8162 * the remaining characters are downcased:
8163 *
8164 * s = 'hello World!' # => "hello World!"
8165 * s.capitalize # => "Hello world!"
8166 *
8167 * The casing may be affected by the given +options+;
8168 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8169 *
8170 * Related: String#capitalize!.
8171 *
8172 */
8173
8174static VALUE
8175rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8176{
8177 rb_encoding *enc;
8178 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8179 VALUE ret;
8180
8181 flags = check_case_options(argc, argv, flags);
8182 enc = str_true_enc(str);
8183 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8184 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8185 ret = rb_str_new(0, RSTRING_LEN(str));
8186 rb_str_ascii_casemap(str, ret, &flags, enc);
8187 }
8188 else {
8189 ret = rb_str_casemap(str, &flags, enc);
8190 }
8191 return ret;
8192}
8193
8194
8195/*
8196 * call-seq:
8197 * swapcase!(*options) -> self or nil
8198 *
8199 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
8201 * returns +self+ if any changes were made, +nil+ otherwise:
8202 *
8203 * s = 'Hello World!' # => "Hello World!"
8204 * s.swapcase! # => "hELLO wORLD!"
8205 * s # => "hELLO wORLD!"
8206 * ''.swapcase! # => nil
8207 *
8208 * The casing may be affected by the given +options+;
8209 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8210 *
8211 * Related: String#swapcase.
8212 *
8213 */
8214
8215static VALUE
8216rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8217{
8218 rb_encoding *enc;
8219 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8220
8221 flags = check_case_options(argc, argv, flags);
8222 str_modify_keep_cr(str);
8223 enc = str_true_enc(str);
8224 if (flags&ONIGENC_CASE_ASCII_ONLY)
8225 rb_str_ascii_casemap(str, str, &flags, enc);
8226 else
8227 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8228
8229 if (ONIGENC_CASE_MODIFIED&flags) return str;
8230 return Qnil;
8231}
8232
8233
8234/*
8235 * call-seq:
8236 * swapcase(*options) -> string
8237 *
8238 * Returns a string containing the characters in +self+, with cases reversed;
8239 * each uppercase character is downcased;
8240 * each lowercase character is upcased:
8241 *
8242 * s = 'Hello World!' # => "Hello World!"
8243 * s.swapcase # => "hELLO wORLD!"
8244 *
8245 * The casing may be affected by the given +options+;
8246 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8247 *
8248 * Related: String#swapcase!.
8249 *
8250 */
8251
8252static VALUE
8253rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8254{
8255 rb_encoding *enc;
8256 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8257 VALUE ret;
8258
8259 flags = check_case_options(argc, argv, flags);
8260 enc = str_true_enc(str);
8261 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8262 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8263 ret = rb_str_new(0, RSTRING_LEN(str));
8264 rb_str_ascii_casemap(str, ret, &flags, enc);
8265 }
8266 else {
8267 ret = rb_str_casemap(str, &flags, enc);
8268 }
8269 return ret;
8270}
8271
typedef unsigned char *USTR;

/* Cursor over a character specification, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while a range "a-z" is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8279
8280static unsigned int
8281trnext(struct tr *t, rb_encoding *enc)
8282{
8283 int n;
8284
8285 for (;;) {
8286 nextpart:
8287 if (!t->gen) {
8288 if (t->p == t->pend) return -1;
8289 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8290 t->p += n;
8291 }
8292 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8293 t->p += n;
8294 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8295 t->p += n;
8296 if (t->p < t->pend) {
8297 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8298 t->p += n;
8299 if (t->now > c) {
8300 if (t->now < 0x80 && c < 0x80) {
8301 rb_raise(rb_eArgError,
8302 "invalid range \"%c-%c\" in string transliteration",
8303 t->now, c);
8304 }
8305 else {
8306 rb_raise(rb_eArgError, "invalid range in string transliteration");
8307 }
8308 continue; /* not reached */
8309 }
8310 else if (t->now < c) {
8311 t->gen = 1;
8312 t->max = c;
8313 }
8314 }
8315 }
8316 return t->now;
8317 }
8318 else {
8319 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8320 if (t->now == t->max) {
8321 t->gen = 0;
8322 goto nextpart;
8323 }
8324 }
8325 if (t->now < t->max) {
8326 return t->now;
8327 }
8328 else {
8329 t->gen = 0;
8330 return t->max;
8331 }
8332 }
8333 }
8334}
8335
8336static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8337
8338static VALUE
8339tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8340{
8341 const unsigned int errc = -1;
8342 unsigned int trans[256];
8343 rb_encoding *enc, *e1, *e2;
8344 struct tr trsrc, trrepl;
8345 int cflag = 0;
8346 unsigned int c, c0, last = 0;
8347 int modify = 0, i, l;
8348 unsigned char *s, *send;
8349 VALUE hash = 0;
8350 int singlebyte = single_byte_optimizable(str);
8351 int termlen;
8352 int cr;
8353
8354#define CHECK_IF_ASCII(c) \
8355 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8356 (cr = ENC_CODERANGE_VALID) : 0)
8357
8358 StringValue(src);
8359 StringValue(repl);
8360 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8361 if (RSTRING_LEN(repl) == 0) {
8362 return rb_str_delete_bang(1, &src, str);
8363 }
8364
8365 cr = ENC_CODERANGE(str);
8366 e1 = rb_enc_check(str, src);
8367 e2 = rb_enc_check(str, repl);
8368 if (e1 == e2) {
8369 enc = e1;
8370 }
8371 else {
8372 enc = rb_enc_check(src, repl);
8373 }
8374 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8375 if (RSTRING_LEN(src) > 1 &&
8376 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8377 trsrc.p + l < trsrc.pend) {
8378 cflag = 1;
8379 trsrc.p += l;
8380 }
8381 trrepl.p = RSTRING_PTR(repl);
8382 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8383 trsrc.gen = trrepl.gen = 0;
8384 trsrc.now = trrepl.now = 0;
8385 trsrc.max = trrepl.max = 0;
8386
8387 if (cflag) {
8388 for (i=0; i<256; i++) {
8389 trans[i] = 1;
8390 }
8391 while ((c = trnext(&trsrc, enc)) != errc) {
8392 if (c < 256) {
8393 trans[c] = errc;
8394 }
8395 else {
8396 if (!hash) hash = rb_hash_new();
8397 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8398 }
8399 }
8400 while ((c = trnext(&trrepl, enc)) != errc)
8401 /* retrieve last replacer */;
8402 last = trrepl.now;
8403 for (i=0; i<256; i++) {
8404 if (trans[i] != errc) {
8405 trans[i] = last;
8406 }
8407 }
8408 }
8409 else {
8410 unsigned int r;
8411
8412 for (i=0; i<256; i++) {
8413 trans[i] = errc;
8414 }
8415 while ((c = trnext(&trsrc, enc)) != errc) {
8416 r = trnext(&trrepl, enc);
8417 if (r == errc) r = trrepl.now;
8418 if (c < 256) {
8419 trans[c] = r;
8420 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8421 }
8422 else {
8423 if (!hash) hash = rb_hash_new();
8424 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8425 }
8426 }
8427 }
8428
8429 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8430 cr = ENC_CODERANGE_7BIT;
8431 str_modify_keep_cr(str);
8432 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8433 termlen = rb_enc_mbminlen(enc);
8434 if (sflag) {
8435 int clen, tlen;
8436 long offset, max = RSTRING_LEN(str);
8437 unsigned int save = -1;
8438 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8439
8440 while (s < send) {
8441 int may_modify = 0;
8442
8443 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8444 if (!MBCLEN_CHARFOUND_P(r)) {
8445 xfree(buf);
8446 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8447 }
8448 clen = MBCLEN_CHARFOUND_LEN(r);
8449 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8450
8451 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8452
8453 s += clen;
8454 if (c < 256) {
8455 c = trans[c];
8456 }
8457 else if (hash) {
8458 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8459 if (NIL_P(tmp)) {
8460 if (cflag) c = last;
8461 else c = errc;
8462 }
8463 else if (cflag) c = errc;
8464 else c = NUM2INT(tmp);
8465 }
8466 else {
8467 c = errc;
8468 }
8469 if (c != (unsigned int)-1) {
8470 if (save == c) {
8471 CHECK_IF_ASCII(c);
8472 continue;
8473 }
8474 save = c;
8475 tlen = rb_enc_codelen(c, enc);
8476 modify = 1;
8477 }
8478 else {
8479 save = -1;
8480 c = c0;
8481 if (enc != e1) may_modify = 1;
8482 }
8483 if ((offset = t - buf) + tlen > max) {
8484 size_t MAYBE_UNUSED(old) = max + termlen;
8485 max = offset + tlen + (send - s);
8486 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8487 t = buf + offset;
8488 }
8489 rb_enc_mbcput(c, t, enc);
8490 if (may_modify && memcmp(s, t, tlen) != 0) {
8491 modify = 1;
8492 }
8493 CHECK_IF_ASCII(c);
8494 t += tlen;
8495 }
8496 if (!STR_EMBED_P(str)) {
8497 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8498 }
8499 TERM_FILL((char *)t, termlen);
8500 RSTRING(str)->as.heap.ptr = (char *)buf;
8501 STR_SET_LEN(str, t - buf);
8502 STR_SET_NOEMBED(str);
8503 RSTRING(str)->as.heap.aux.capa = max;
8504 }
8505 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8506 while (s < send) {
8507 c = (unsigned char)*s;
8508 if (trans[c] != errc) {
8509 if (!cflag) {
8510 c = trans[c];
8511 *s = c;
8512 modify = 1;
8513 }
8514 else {
8515 *s = last;
8516 modify = 1;
8517 }
8518 }
8519 CHECK_IF_ASCII(c);
8520 s++;
8521 }
8522 }
8523 else {
8524 int clen, tlen;
8525 long offset, max = (long)((send - s) * 1.2);
8526 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8527
8528 while (s < send) {
8529 int may_modify = 0;
8530
8531 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8532 if (!MBCLEN_CHARFOUND_P(r)) {
8533 xfree(buf);
8534 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8535 }
8536 clen = MBCLEN_CHARFOUND_LEN(r);
8537 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8538
8539 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8540
8541 if (c < 256) {
8542 c = trans[c];
8543 }
8544 else if (hash) {
8545 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8546 if (NIL_P(tmp)) {
8547 if (cflag) c = last;
8548 else c = errc;
8549 }
8550 else if (cflag) c = errc;
8551 else c = NUM2INT(tmp);
8552 }
8553 else {
8554 c = cflag ? last : errc;
8555 }
8556 if (c != errc) {
8557 tlen = rb_enc_codelen(c, enc);
8558 modify = 1;
8559 }
8560 else {
8561 c = c0;
8562 if (enc != e1) may_modify = 1;
8563 }
8564 if ((offset = t - buf) + tlen > max) {
8565 size_t MAYBE_UNUSED(old) = max + termlen;
8566 max = offset + tlen + (long)((send - s) * 1.2);
8567 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8568 t = buf + offset;
8569 }
8570 if (s != t) {
8571 rb_enc_mbcput(c, t, enc);
8572 if (may_modify && memcmp(s, t, tlen) != 0) {
8573 modify = 1;
8574 }
8575 }
8576 CHECK_IF_ASCII(c);
8577 s += clen;
8578 t += tlen;
8579 }
8580 if (!STR_EMBED_P(str)) {
8581 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8582 }
8583 TERM_FILL((char *)t, termlen);
8584 RSTRING(str)->as.heap.ptr = (char *)buf;
8585 STR_SET_LEN(str, t - buf);
8586 STR_SET_NOEMBED(str);
8587 RSTRING(str)->as.heap.aux.capa = max;
8588 }
8589
8590 if (modify) {
8591 if (cr != ENC_CODERANGE_BROKEN)
8592 ENC_CODERANGE_SET(str, cr);
8593 rb_enc_associate(str, enc);
8594 return str;
8595 }
8596 return Qnil;
8597}
8598
8599
8600/*
8601 * call-seq:
8602 * tr!(selector, replacements) -> self or nil
8603 *
8604 * Like String#tr, but modifies +self+ in place.
8605 * Returns +self+ if any changes were made, +nil+ otherwise.
8606 *
8607 */
8608
8609static VALUE
8610rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8611{
8612 return tr_trans(str, src, repl, 0);
8613}
8614
8615
8616/*
8617 * call-seq:
8618 * tr(selector, replacements) -> new_string
8619 *
8620 * Returns a copy of +self+ with each character specified by string +selector+
8621 * translated to the corresponding character in string +replacements+.
8622 * The correspondence is _positional_:
8623 *
8624 * - Each occurrence of the first character specified by +selector+
8625 * is translated to the first character in +replacements+.
8626 * - Each occurrence of the second character specified by +selector+
8627 * is translated to the second character in +replacements+.
8628 * - And so on.
8629 *
8630 * Example:
8631 *
8632 * 'hello'.tr('el', 'ip') #=> "hippo"
8633 *
8634 * If +replacements+ is shorter than +selector+,
8635 * it is implicitly padded with its own last character:
8636 *
8637 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8638 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8639 *
8640 * Arguments +selector+ and +replacements+ must be valid character selectors
8641 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8642 * and may use any of its valid forms, including negation, ranges, and escaping:
8643 *
8644 * # Negation.
8645 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8646 * # Ranges.
8647 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8648 * # Escapes.
8649 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8650 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8651 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8652 *
8653 */
8654
8655static VALUE
8656rb_str_tr(VALUE str, VALUE src, VALUE repl)
8657{
8658 str = str_duplicate(rb_cString, str);
8659 tr_trans(str, src, repl, 0);
8660 return str;
8661}
8662
/* Number of code points that are looked up directly in the flat table. */
8663#define TR_TABLE_MAX (UCHAR_MAX+1)
/* One extra trailing slot, stable[TR_TABLE_MAX], holds the flag that
 * tr_find() falls back to for code points >= TR_TABLE_MAX. */
8664#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string +str+ into the membership data shared by
 * delete/squeeze/count.
 *
 *  stable   flat table for code points < TR_TABLE_MAX plus the trailing
 *           flag slot; combined with earlier selectors by logical AND.
 *  first    non-zero for the first selector of a call: the flat table is
 *           reset to all ones before the AND is applied.
 *  tablep   in/out hash of code points >= TR_TABLE_MAX coming from
 *           non-negated selectors.
 *  ctablep  in/out hash of code points >= TR_TABLE_MAX coming from
 *           negated ('^'-prefixed) selectors.
 *  enc      encoding used to decode +str+.
 *
 * NOTE(review): trnext() and struct tr are defined outside this view;
 * trnext() is presumably the selector iterator that yields one code point
 * per call and errc when exhausted -- confirm against its definition.
 */
8665static void
8666tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8667 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8668{
8669 const unsigned int errc = -1;
8670 char buf[TR_TABLE_MAX];
8671 struct tr tr;
8672 unsigned int c;
8673 VALUE table = 0, ptable = 0;
8674 int i, l, cflag = 0;
8675
8676 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8677 tr.gen = tr.now = tr.max = 0;
8678
 /* A leading '^' negates the selector, but only when something follows it
  * (a selector that is just "^" is taken literally). */
8679 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8680 cflag = 1;
8681 tr.p += l;
8682 }
8683 if (first) {
 /* First selector: start from "everything selected" so the AND at the
  * end of this function reduces to this selector's own set. */
8684 for (i=0; i<TR_TABLE_MAX; i++) {
8685 stable[i] = 1;
8686 }
8687 stable[TR_TABLE_MAX] = cflag;
8688 }
8689 else if (stable[TR_TABLE_MAX] && !cflag) {
 /* A later non-negated selector clears the trailing flag slot. */
8690 stable[TR_TABLE_MAX] = 0;
8691 }
 /* buf is this selector's own set for code points < TR_TABLE_MAX: it
  * starts as "all" for a negated selector and "none" otherwise. */
8692 for (i=0; i<TR_TABLE_MAX; i++) {
8693 buf[i] = cflag;
8694 }
8695
8696 while ((c = trnext(&tr, enc)) != errc) {
8697 if (c < TR_TABLE_MAX) {
8698 buf[(unsigned char)c] = !cflag;
8699 }
8700 else {
8701 VALUE key = UINT2NUM(c);
8702
 /* Pick the hash to fill lazily, on the first large code point. */
8703 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8704 if (cflag) {
 /* Negated selector: keep adding to the existing negated hash. */
8705 ptable = *ctablep;
8706 table = ptable ? ptable : rb_hash_new();
8707 *ctablep = table;
8708 }
8709 else {
 /* Non-negated selector: build a fresh hash and remember the
  * previous one in ptable so entries can be filtered against it. */
8710 table = rb_hash_new();
8711 ptable = *tablep;
8712 *tablep = table;
8713 }
8714 }
 /* Record the key when there is no previous hash, or when cflag differs
  * from "key is present in the previous hash". */
8715 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8716 rb_hash_aset(table, key, Qtrue);
8717 }
8718 }
8719 }
 /* Combine this selector with the accumulated flat table. */
8720 for (i=0; i<TR_TABLE_MAX; i++) {
8721 stable[i] = stable[i] && buf[i];
8722 }
 /* A non-negated selector that produced no hash resets *tablep. */
8723 if (!table && !cflag) {
8724 *tablep = 0;
8725 }
8726}
8727
8728
8729static int
8730tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8731{
8732 if (c < TR_TABLE_MAX) {
8733 return table[c] != 0;
8734 }
8735 else {
8736 VALUE v = UINT2NUM(c);
8737
8738 if (del) {
8739 if (!NIL_P(rb_hash_lookup(del, v)) &&
8740 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8741 return TRUE;
8742 }
8743 }
8744 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8745 return FALSE;
8746 }
8747 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8748 }
8749}
8750
8751/*
8752 * call-seq:
8753 * delete!(*selectors) -> self or nil
8754 *
8755 * Like String#delete, but modifies +self+ in place.
8756 * Returns +self+ if any changes were made, +nil+ otherwise.
8757 *
8758 */
8759
8760static VALUE
8761rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8762{
8763 char squeez[TR_TABLE_SIZE];
8764 rb_encoding *enc = 0;
8765 char *s, *send, *t;
8766 VALUE del = 0, nodel = 0;
8767 int modify = 0;
8768 int i, ascompat, cr;
8769
8770 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8772 for (i=0; i<argc; i++) {
8773 VALUE s = argv[i];
8774
8775 StringValue(s);
8776 enc = rb_enc_check(str, s);
8777 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8778 }
8779
8780 str_modify_keep_cr(str);
8781 ascompat = rb_enc_asciicompat(enc);
8782 s = t = RSTRING_PTR(str);
8783 send = RSTRING_END(str);
8784 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8785 while (s < send) {
8786 unsigned int c;
8787 int clen;
8788
8789 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8790 if (squeez[c]) {
8791 modify = 1;
8792 }
8793 else {
8794 if (t != s) *t = c;
8795 t++;
8796 }
8797 s++;
8798 }
8799 else {
8800 c = rb_enc_codepoint_len(s, send, &clen, enc);
8801
8802 if (tr_find(c, squeez, del, nodel)) {
8803 modify = 1;
8804 }
8805 else {
8806 if (t != s) rb_enc_mbcput(c, t, enc);
8807 t += clen;
8809 }
8810 s += clen;
8811 }
8812 }
8813 TERM_FILL(t, TERM_LEN(str));
8814 STR_SET_LEN(str, t - RSTRING_PTR(str));
8815 ENC_CODERANGE_SET(str, cr);
8816
8817 if (modify) return str;
8818 return Qnil;
8819}
8820
8821
8822/*
8823 * call-seq:
8824 * delete(*selectors) -> new_string
8825 *
8826 * Returns a copy of +self+ with characters specified by +selectors+ removed
8827 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8828 *
8829 * "hello".delete "l","lo" #=> "heo"
8830 * "hello".delete "lo" #=> "he"
8831 * "hello".delete "aeiou", "^e" #=> "hell"
8832 * "hello".delete "ej-m" #=> "ho"
8833 *
8834 */
8835
8836static VALUE
8837rb_str_delete(int argc, VALUE *argv, VALUE str)
8838{
8839 str = str_duplicate(rb_cString, str);
8840 rb_str_delete_bang(argc, argv, str);
8841 return str;
8842}
8843
8844
8845/*
8846 * call-seq:
8847 * squeeze!(*selectors) -> self or nil
8848 *
8849 * Like String#squeeze, but modifies +self+ in place.
8850 * Returns +self+ if any changes were made, +nil+ otherwise.
8851 */
8852
8853static VALUE
8854rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8855{
 /*
  * Collapses, in place, runs of the same character down to one character.
  * With selectors only the selected characters are collapsed; with none
  * every character is.  Returns the receiver when its length changed and
  * nil otherwise.
  */
8856 char squeez[TR_TABLE_SIZE];
8857 rb_encoding *enc = 0;
8858 VALUE del = 0, nodel = 0;
8859 unsigned char *s, *send, *t;
8860 int i, modify = 0;
 /* NOTE(review): single_byte_optimizable() is defined outside this view;
  * presumably true when every character occupies one byte -- confirm. */
8861 int ascompat, singlebyte = single_byte_optimizable(str);
8862 unsigned int save;
8863
8864 if (argc == 0) {
 /* No selectors: squeez[] is never filled and never read (all reads
  * below are guarded by argc > 0). */
8865 enc = STR_ENC_GET(str);
8866 }
8867 else {
8868 for (i=0; i<argc; i++) {
8869 VALUE s = argv[i];
8870
8871 StringValue(s);
8872 enc = rb_enc_check(str, s);
 /* The byte-wise loop is only used when the receiver and every
  * selector qualify. */
8873 if (singlebyte && !single_byte_optimizable(s))
8874 singlebyte = 0;
8875 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8876 }
8877 }
8878
8879 str_modify_keep_cr(str);
 /* s is the read cursor, t the write cursor over the same buffer. */
8880 s = t = (unsigned char *)RSTRING_PTR(str);
8881 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8882 send = (unsigned char *)RSTRING_END(str);
 /* save holds the last character written; -1 means none yet. */
8883 save = -1;
8884 ascompat = rb_enc_asciicompat(enc);
8885
8886 if (singlebyte) {
8887 while (s < send) {
8888 unsigned int c = *s++;
 /* Keep the byte unless it repeats the previous one and is selected. */
8889 if (c != save || (argc > 0 && !squeez[c])) {
8890 *t++ = save = c;
8891 }
8892 }
8893 }
8894 else {
8895 while (s < send) {
8896 unsigned int c;
8897 int clen;
8898
 /* Bytes below 0x80 in an ASCII-compatible encoding skip decoding. */
8899 if (ascompat && (c = *s) < 0x80) {
8900 if (c != save || (argc > 0 && !squeez[c])) {
8901 *t++ = save = c;
8902 }
8903 s++;
8904 }
8905 else {
8906 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8907
8908 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
 /* Nothing to copy while the cursors still coincide. */
8909 if (t != s) rb_enc_mbcput(c, t, enc);
8910 save = c;
8911 t += clen;
8912 }
8913 s += clen;
8914 }
8915 }
8916 }
8917
8918 TERM_FILL((char *)t, TERM_LEN(str));
 /* A shorter result is the only evidence that something was squeezed. */
8919 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8920 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8921 modify = 1;
8922 }
8923
8924 if (modify) return str;
8925 return Qnil;
8926}
8927
8928
8929/*
8930 * call-seq:
8931 * squeeze(*selectors) -> new_string
8932 *
8933 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8934 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8935 *
8936 * "Squeezed" means that each multiple-character run of a selected character
8937 * is squeezed down to a single character;
8938 * with no arguments given, squeezes all characters:
8939 *
8940 * "yellow moon".squeeze #=> "yelow mon"
8941 * "  now   is  the".squeeze(" ") #=> " now is the"
8942 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8943 *
8944 */
8945
8946static VALUE
8947rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8948{
8949 str = str_duplicate(rb_cString, str);
8950 rb_str_squeeze_bang(argc, argv, str);
8951 return str;
8952}
8953
8954
8955/*
8956 * call-seq:
8957 * tr_s!(selector, replacements) -> self or nil
8958 *
8959 * Like String#tr_s, but modifies +self+ in place.
8960 * Returns +self+ if any changes were made, +nil+ otherwise.
8961 *
8962 * Related: String#squeeze!.
8963 */
8964
8965static VALUE
8966rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8967{
8968 return tr_trans(str, src, repl, 1);
8969}
8970
8971
8972/*
8973 * call-seq:
8974 * tr_s(selector, replacements) -> string
8975 *
8976 * Like String#tr, but also squeezes the modified portions of the translated string;
8977 * returns a new string (translated and squeezed).
8978 *
8979 * 'hello'.tr_s('l', 'r') #=> "hero"
8980 * 'hello'.tr_s('el', '-') #=> "h-o"
8981 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8982 *
8983 * Related: String#squeeze.
8984 *
8985 */
8986
8987static VALUE
8988rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8989{
8990 str = str_duplicate(rb_cString, str);
8991 tr_trans(str, src, repl, 1);
8992 return str;
8993}
8994
8995
8996/*
8997 * call-seq:
8998 * count(*selectors) -> integer
8999 *
9000 * Returns the total number of characters in +self+
9001 * that are specified by the given +selectors+
9002 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9003 *
9004 * a = "hello world"
9005 * a.count "lo" #=> 5
9006 * a.count "lo", "o" #=> 2
9007 * a.count "hello", "^l" #=> 4
9008 * a.count "ej-m" #=> 4
9009 *
9010 * "hello^world".count "\\^aeiou" #=> 4
9011 * "hello-world".count "a\\-eo" #=> 4
9012 *
9013 * c = "hello world\\r\\n"
9014 * c.count "\\" #=> 2
9015 * c.count "\\A" #=> 0
9016 * c.count "X-\\w" #=> 3
9017 */
9018
9019static VALUE
9020rb_str_count(int argc, VALUE *argv, VALUE str)
9021{
9022 char table[TR_TABLE_SIZE];
9023 rb_encoding *enc = 0;
9024 VALUE del = 0, nodel = 0, tstr;
9025 char *s, *send;
9026 int i;
9027 int ascompat;
9028 size_t n = 0;
9029
9031
9032 tstr = argv[0];
9033 StringValue(tstr);
9034 enc = rb_enc_check(str, tstr);
9035 if (argc == 1) {
9036 const char *ptstr;
9037 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9038 (ptstr = RSTRING_PTR(tstr),
9039 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9040 !is_broken_string(str)) {
9041 int clen;
9042 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9043
9044 s = RSTRING_PTR(str);
9045 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9046 send = RSTRING_END(str);
9047 while (s < send) {
9048 if (*(unsigned char*)s++ == c) n++;
9049 }
9050 return SIZET2NUM(n);
9051 }
9052 }
9053
9054 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9055 for (i=1; i<argc; i++) {
9056 tstr = argv[i];
9057 StringValue(tstr);
9058 enc = rb_enc_check(str, tstr);
9059 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9060 }
9061
9062 s = RSTRING_PTR(str);
9063 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9064 send = RSTRING_END(str);
9065 ascompat = rb_enc_asciicompat(enc);
9066 while (s < send) {
9067 unsigned int c;
9068
9069 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9070 if (table[c]) {
9071 n++;
9072 }
9073 s++;
9074 }
9075 else {
9076 int clen;
9077 c = rb_enc_codepoint_len(s, send, &clen, enc);
9078 if (tr_find(c, table, del, nodel)) {
9079 n++;
9080 }
9081 s += clen;
9082 }
9083 }
9084
9085 return SIZET2NUM(n);
9086}
9087
9088static VALUE
9089rb_fs_check(VALUE val)
9090{
9091 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9092 val = rb_check_string_type(val);
9093 if (NIL_P(val)) return 0;
9094 }
9095 return val;
9096}
9097
/* Byte-indexed lookup table: 1 for the six ASCII whitespace bytes, 0 for
 * every other value (unlisted elements are zero-initialized). */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1  /* space */
};

/* The cast keeps a negative plain char from indexing before the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9118
9119static long
9120split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9121{
9122 if (empty_count >= 0 && len == 0) {
9123 return empty_count + 1;
9124 }
9125 if (empty_count > 0) {
9126 /* make different substrings */
9127 if (result) {
9128 do {
9129 rb_ary_push(result, str_new_empty_String(str));
9130 } while (--empty_count > 0);
9131 }
9132 else {
9133 do {
9134 rb_yield(str_new_empty_String(str));
9135 } while (--empty_count > 0);
9136 }
9137 }
9138 str = rb_str_subseq(str, beg, len);
9139 if (result) {
9140 rb_ary_push(result, str);
9141 }
9142 else {
9143 rb_yield(str);
9144 }
9145 return empty_count;
9146}
9147
/* Strategy selected by rb_str_split_m() for cutting the string. */
typedef enum {
    SPLIT_TYPE_AWK,     /* runs of whitespace separate fields */
    SPLIT_TYPE_STRING,  /* a literal separator string */
    SPLIT_TYPE_REGEXP,  /* a regular expression separator */
    SPLIT_TYPE_CHARS    /* empty separator: one piece per character */
} split_type_t;
9151
9152static split_type_t
9153literal_split_pattern(VALUE spat, split_type_t default_type)
9154{
9155 rb_encoding *enc = STR_ENC_GET(spat);
9156 const char *ptr;
9157 long len;
9158 RSTRING_GETMEM(spat, ptr, len);
9159 if (len == 0) {
9160 /* Special case - split into chars */
9161 return SPLIT_TYPE_CHARS;
9162 }
9163 else if (rb_enc_asciicompat(enc)) {
9164 if (len == 1 && ptr[0] == ' ') {
9165 return SPLIT_TYPE_AWK;
9166 }
9167 }
9168 else {
9169 int l;
9170 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9171 return SPLIT_TYPE_AWK;
9172 }
9173 }
9174 return default_type;
9175}
9176
9177/*
9178 * call-seq:
9179 * split(field_sep = $;, limit = 0) -> array
9180 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9181 *
9182 * :include: doc/string/split.rdoc
9183 *
9184 */
9185
9186static VALUE
9187rb_str_split_m(int argc, VALUE *argv, VALUE str)
9188{
9189 rb_encoding *enc;
9190 VALUE spat;
9191 VALUE limit;
9192 split_type_t split_type;
9193 long beg, end, i = 0, empty_count = -1;
9194 int lim = 0;
9195 VALUE result, tmp;
9196
9197 result = rb_block_given_p() ? Qfalse : Qnil;
9198 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9199 lim = NUM2INT(limit);
9200 if (lim <= 0) limit = Qnil;
9201 else if (lim == 1) {
9202 if (RSTRING_LEN(str) == 0)
9203 return result ? rb_ary_new2(0) : str;
9204 tmp = str_duplicate(rb_cString, str);
9205 if (!result) {
9206 rb_yield(tmp);
9207 return str;
9208 }
9209 return rb_ary_new3(1, tmp);
9210 }
9211 i = 1;
9212 }
9213 if (NIL_P(limit) && !lim) empty_count = 0;
9214
9215 enc = STR_ENC_GET(str);
9216 split_type = SPLIT_TYPE_REGEXP;
9217 if (!NIL_P(spat)) {
9218 spat = get_pat_quoted(spat, 0);
9219 }
9220 else if (NIL_P(spat = rb_fs)) {
9221 split_type = SPLIT_TYPE_AWK;
9222 }
9223 else if (!(spat = rb_fs_check(spat))) {
9224 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9225 }
9226 else {
9227 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9228 }
9229 if (split_type != SPLIT_TYPE_AWK) {
9230 switch (BUILTIN_TYPE(spat)) {
9231 case T_REGEXP:
9232 rb_reg_options(spat); /* check if uninitialized */
9233 tmp = RREGEXP_SRC(spat);
9234 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9235 if (split_type == SPLIT_TYPE_AWK) {
9236 spat = tmp;
9237 split_type = SPLIT_TYPE_STRING;
9238 }
9239 break;
9240
9241 case T_STRING:
9242 mustnot_broken(spat);
9243 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9244 break;
9245
9246 default:
9248 }
9249 }
9250
9251#define SPLIT_STR(beg, len) ( \
9252 empty_count = split_string(result, str, beg, len, empty_count), \
9253 str_mod_check(str, str_start, str_len))
9254
9255 beg = 0;
9256 char *ptr = RSTRING_PTR(str);
9257 char *const str_start = ptr;
9258 const long str_len = RSTRING_LEN(str);
9259 char *const eptr = str_start + str_len;
9260 if (split_type == SPLIT_TYPE_AWK) {
9261 char *bptr = ptr;
9262 int skip = 1;
9263 unsigned int c;
9264
9265 if (result) result = rb_ary_new();
9266 end = beg;
9267 if (is_ascii_string(str)) {
9268 while (ptr < eptr) {
9269 c = (unsigned char)*ptr++;
9270 if (skip) {
9271 if (ascii_isspace(c)) {
9272 beg = ptr - bptr;
9273 }
9274 else {
9275 end = ptr - bptr;
9276 skip = 0;
9277 if (!NIL_P(limit) && lim <= i) break;
9278 }
9279 }
9280 else if (ascii_isspace(c)) {
9281 SPLIT_STR(beg, end-beg);
9282 skip = 1;
9283 beg = ptr - bptr;
9284 if (!NIL_P(limit)) ++i;
9285 }
9286 else {
9287 end = ptr - bptr;
9288 }
9289 }
9290 }
9291 else {
9292 while (ptr < eptr) {
9293 int n;
9294
9295 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9296 ptr += n;
9297 if (skip) {
9298 if (rb_isspace(c)) {
9299 beg = ptr - bptr;
9300 }
9301 else {
9302 end = ptr - bptr;
9303 skip = 0;
9304 if (!NIL_P(limit) && lim <= i) break;
9305 }
9306 }
9307 else if (rb_isspace(c)) {
9308 SPLIT_STR(beg, end-beg);
9309 skip = 1;
9310 beg = ptr - bptr;
9311 if (!NIL_P(limit)) ++i;
9312 }
9313 else {
9314 end = ptr - bptr;
9315 }
9316 }
9317 }
9318 }
9319 else if (split_type == SPLIT_TYPE_STRING) {
9320 char *substr_start = ptr;
9321 char *sptr = RSTRING_PTR(spat);
9322 long slen = RSTRING_LEN(spat);
9323
9324 if (result) result = rb_ary_new();
9325 mustnot_broken(str);
9326 enc = rb_enc_check(str, spat);
9327 while (ptr < eptr &&
9328 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9329 /* Check we are at the start of a char */
9330 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9331 if (t != ptr + end) {
9332 ptr = t;
9333 continue;
9334 }
9335 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9336 str_mod_check(spat, sptr, slen);
9337 ptr += end + slen;
9338 substr_start = ptr;
9339 if (!NIL_P(limit) && lim <= ++i) break;
9340 }
9341 beg = ptr - str_start;
9342 }
9343 else if (split_type == SPLIT_TYPE_CHARS) {
9344 int n;
9345
9346 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9347 mustnot_broken(str);
9348 enc = rb_enc_get(str);
9349 while (ptr < eptr &&
9350 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9351 SPLIT_STR(ptr - str_start, n);
9352 ptr += n;
9353 if (!NIL_P(limit) && lim <= ++i) break;
9354 }
9355 beg = ptr - str_start;
9356 }
9357 else {
9358 if (result) result = rb_ary_new();
9359 long len = RSTRING_LEN(str);
9360 long start = beg;
9361 long idx;
9362 int last_null = 0;
9363 struct re_registers *regs;
9364 VALUE match = 0;
9365
9366 for (; rb_reg_search(spat, str, start, 0) >= 0;
9367 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9368 match = rb_backref_get();
9369 if (!result) rb_match_busy(match);
9370 regs = RMATCH_REGS(match);
9371 end = BEG(0);
9372 if (start == end && BEG(0) == END(0)) {
9373 if (!ptr) {
9374 SPLIT_STR(0, 0);
9375 break;
9376 }
9377 else if (last_null == 1) {
9378 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9379 beg = start;
9380 }
9381 else {
9382 if (start == len)
9383 start++;
9384 else
9385 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9386 last_null = 1;
9387 continue;
9388 }
9389 }
9390 else {
9391 SPLIT_STR(beg, end-beg);
9392 beg = start = END(0);
9393 }
9394 last_null = 0;
9395
9396 for (idx=1; idx < regs->num_regs; idx++) {
9397 if (BEG(idx) == -1) continue;
9398 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9399 }
9400 if (!NIL_P(limit) && lim <= ++i) break;
9401 }
9402 if (match) rb_match_unbusy(match);
9403 }
9404 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9405 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9406 }
9407
9408 return result ? result : str;
9409}
9410
9411VALUE
9412rb_str_split(VALUE str, const char *sep0)
9413{
9414 VALUE sep;
9415
9416 StringValue(str);
9417 sep = rb_str_new_cstr(sep0);
9418 return rb_str_split_m(1, &sep, str);
9419}
9420
9421#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9422
9423static inline int
9424enumerator_element(VALUE ary, VALUE e)
9425{
9426 if (ary) {
9427 rb_ary_push(ary, e);
9428 return 0;
9429 }
9430 else {
9431 rb_yield(e);
9432 return 1;
9433 }
9434}
9435
9436#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9437
9438static const char *
9439chomp_newline(const char *p, const char *e, rb_encoding *enc)
9440{
9441 const char *prev = rb_enc_prev_char(p, e, e, enc);
9442 if (rb_enc_is_newline(prev, e, enc)) {
9443 e = prev;
9444 prev = rb_enc_prev_char(p, e, e, enc);
9445 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9446 e = prev;
9447 }
9448 return e;
9449}
9450
9451static VALUE
9452get_rs(void)
9453{
9454 VALUE rs = rb_rs;
9455 if (!NIL_P(rs) &&
9456 (!RB_TYPE_P(rs, T_STRING) ||
9457 RSTRING_LEN(rs) != 1 ||
9458 RSTRING_PTR(rs)[0] != '\n')) {
9459 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9460 }
9461 return rs;
9462}
9463
9464#define rb_rs get_rs()
9465
/*
 * Shared worker for each_line and lines: cuts +str+ into lines and hands
 * each one to ENUM_ELEM, which appends to +ary+ when it is non-zero and
 * yields otherwise.  Accepts an optional separator argument and a
 * +chomp+ keyword in argc/argv.  Returns +ary+ when it is non-zero,
 * otherwise the original receiver.
 */
9466static VALUE
9467rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9468{
9469 rb_encoding *enc;
9470 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9471 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9472 long pos, len, rslen;
9473 int rsnewline = 0;
9474
 /* No separator argument: rb_rs expands to get_rs() at this point. */
9475 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9476 rs = rb_rs;
9477 if (!NIL_P(opts)) {
9478 static ID keywords[1];
9479 if (!keywords[0]) {
9480 keywords[0] = rb_intern_const("chomp");
9481 }
9482 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
 /* From here on chomp is a plain C truth value, not a Ruby object. */
9483 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9484 }
9485
 /* A nil separator means the whole string is the single line. */
9486 if (NIL_P(rs)) {
9487 if (!ENUM_ELEM(ary, str)) {
9488 return ary;
9489 }
9490 else {
9491 return orig;
9492 }
9493 }
9494
9495 if (!RSTRING_LEN(str)) goto end;
 /* The scan runs over the result of rb_str_new_frozen(); the receiver
  * itself stays reachable through orig for the return value. */
9496 str = rb_str_new_frozen(str);
9497 ptr = subptr = RSTRING_PTR(str);
9498 pend = RSTRING_END(str);
9499 len = RSTRING_LEN(str);
9500 StringValue(rs);
9501 rslen = RSTRING_LEN(rs);
9502
 /* The default separator skips the encoding compatibility check. */
9503 if (rs == rb_default_rs)
9504 enc = rb_enc_get(str);
9505 else
9506 enc = rb_enc_check(str, rs);
9507
9508 if (rslen == 0) {
9509 /* paragraph mode */
9510 int n;
9511 const char *eol = NULL;
9512 subend = subptr;
9513 while (subend < pend) {
9514 long chomp_rslen = 0;
 /* Inner loop: advance subend one character (or CR+newline pair) at a
  * time until a newline is met right where the previous one ended
  * (eol == subend), i.e. until a blank line closes the paragraph. */
9515 do {
 /* n is the width of a leading '\r', or 0 when there is none. */
9516 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9517 n = 0;
9518 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9519 if (rb_enc_is_newline(subend + n, pend, enc)) {
9520 if (eol == subend) break;
9521 subend += rslen;
 /* subptr is NULL between paragraphs, so newlines seen before the
  * next paragraph starts do not record an end-of-line. */
9522 if (subptr) {
9523 eol = subend;
9524 chomp_rslen = -rslen;
9525 }
9526 }
9527 else {
9528 if (!subptr) subptr = subend;
9529 subend += rslen;
9530 }
9531 rslen = 0;
9532 } while (subend < pend);
9533 if (!subptr) break;
 /* rslen is non-zero only when the loop stopped at the blank line. */
9534 if (rslen == 0) chomp_rslen = 0;
9535 line = rb_str_subseq(str, subptr - ptr,
9536 subend - subptr + (chomp ? chomp_rslen : rslen));
 /* ENUM_ELEM returns non-zero only when it yielded to a block. */
9537 if (ENUM_ELEM(ary, line)) {
9538 str_mod_check(str, ptr, len);
9539 }
9540 subptr = eol = NULL;
9541 }
9542 goto end;
9543 }
9544 else {
9545 rsptr = RSTRING_PTR(rs);
 /* Remember whether the separator is exactly one minimal-width newline;
  * chomping then goes through chomp_newline(). */
9546 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9547 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9548 rsnewline = 1;
9549 }
9550 }
9551
 /* For an encoding that is not ASCII-compatible the default separator is
  * re-encoded into the string's encoding before searching. */
9552 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9553 rs = rb_str_new(rsptr, rslen);
9554 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9555 rsptr = RSTRING_PTR(rs);
9556 rslen = RSTRING_LEN(rs);
9557 }
9558
9559 while (subptr < pend) {
9560 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9561 if (pos < 0) break;
9562 hit = subptr + pos;
 /* Discard a hit that does not begin on a character boundary and resume
  * the search from the adjusted position. */
9563 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9564 if (hit != adjusted) {
9565 subptr = adjusted;
9566 continue;
9567 }
 /* hit now points just past the separator; by default the line keeps it. */
9568 subend = hit += rslen;
9569 if (chomp) {
9570 if (rsnewline) {
9571 subend = chomp_newline(subptr, subend, enc);
9572 }
9573 else {
9574 subend -= rslen;
9575 }
9576 }
9577 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9578 if (ENUM_ELEM(ary, line)) {
9579 str_mod_check(str, ptr, len);
9580 }
9581 subptr = hit;
9582 }
9583
 /* Whatever follows the last separator forms the final line. */
9584 if (subptr != pend) {
9585 if (chomp) {
9586 if (rsnewline) {
9587 pend = chomp_newline(subptr, pend, enc);
9588 }
9589 else if (pend - subptr >= rslen &&
9590 memcmp(pend - rslen, rsptr, rslen) == 0) {
9591 pend -= rslen;
9592 }
9593 }
9594 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9595 ENUM_ELEM(ary, line);
9596 RB_GC_GUARD(str);
9597 }
9598
9599 end:
9600 if (ary)
9601 return ary;
9602 else
9603 return orig;
9604}
9605
9606/*
9607 * call-seq:
9608 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9609 * each_line(line_sep = $/, chomp: false) -> enumerator
9610 *
9611 * :include: doc/string/each_line.rdoc
9612 *
9613 */
9614
9615static VALUE
9616rb_str_each_line(int argc, VALUE *argv, VALUE str)
9617{
9618 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9619 return rb_str_enumerate_lines(argc, argv, str, 0);
9620}
9621
9622/*
9623 * call-seq:
9624 * lines(line_sep = $/, chomp: false) -> array_of_strings
9625 *
9626 * Forms substrings ("lines") of +self+ according to the given arguments
9627 * (see String#each_line for details); returns the lines in an array.
9628 *
9629 */
9630
9631static VALUE
9632rb_str_lines(int argc, VALUE *argv, VALUE str)
9633{
9634 VALUE ary = WANTARRAY("lines", 0);
9635 return rb_str_enumerate_lines(argc, argv, str, ary);
9636}
9637
9638static VALUE
9639rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9640{
9641 return LONG2FIX(RSTRING_LEN(str));
9642}
9643
9644static VALUE
9645rb_str_enumerate_bytes(VALUE str, VALUE ary)
9646{
9647 long i;
9648
9649 for (i=0; i<RSTRING_LEN(str); i++) {
9650 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9651 }
9652 if (ary)
9653 return ary;
9654 else
9655 return str;
9656}
9657
9658/*
9659 * call-seq:
9660 * each_byte {|byte| ... } -> self
9661 * each_byte -> enumerator
9662 *
9663 * :include: doc/string/each_byte.rdoc
9664 *
9665 */
9666
9667static VALUE
9668rb_str_each_byte(VALUE str)
9669{
9670 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9671 return rb_str_enumerate_bytes(str, 0);
9672}
9673
9674/*
9675 * call-seq:
9676 * bytes -> array_of_bytes
9677 *
9678 * :include: doc/string/bytes.rdoc
9679 *
9680 */
9681
9682static VALUE
9683rb_str_bytes(VALUE str)
9684{
9685 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9686 return rb_str_enumerate_bytes(str, ary);
9687}
9688
9689static VALUE
9690rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9691{
9692 return rb_str_length(str);
9693}
9694
9695static VALUE
9696rb_str_enumerate_chars(VALUE str, VALUE ary)
9697{
9698 VALUE orig = str;
9699 long i, len, n;
9700 const char *ptr;
9701 rb_encoding *enc;
9702
9703 str = rb_str_new_frozen(str);
9704 ptr = RSTRING_PTR(str);
9705 len = RSTRING_LEN(str);
9706 enc = rb_enc_get(str);
9707
9709 for (i = 0; i < len; i += n) {
9710 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9711 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9712 }
9713 }
9714 else {
9715 for (i = 0; i < len; i += n) {
9716 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9717 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9718 }
9719 }
9720 RB_GC_GUARD(str);
9721 if (ary)
9722 return ary;
9723 else
9724 return orig;
9725}
9726
9727/*
9728 * call-seq:
9729 * each_char {|c| ... } -> self
9730 * each_char -> enumerator
9731 *
9732 * :include: doc/string/each_char.rdoc
9733 *
9734 */
9735
9736static VALUE
9737rb_str_each_char(VALUE str)
9738{
9739 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9740 return rb_str_enumerate_chars(str, 0);
9741}
9742
9743/*
9744 * call-seq:
9745 * chars -> array_of_characters
9746 *
9747 * :include: doc/string/chars.rdoc
9748 *
9749 */
9750
9751static VALUE
9752rb_str_chars(VALUE str)
9753{
9754 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9755 return rb_str_enumerate_chars(str, ary);
9756}
9757
9758static VALUE
9759rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9760{
9761 VALUE orig = str;
9762 int n;
9763 unsigned int c;
9764 const char *ptr, *end;
9765 rb_encoding *enc;
9766
9767 if (single_byte_optimizable(str))
9768 return rb_str_enumerate_bytes(str, ary);
9769
9770 str = rb_str_new_frozen(str);
9771 ptr = RSTRING_PTR(str);
9772 end = RSTRING_END(str);
9773 enc = STR_ENC_GET(str);
9774
9775 while (ptr < end) {
9776 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9777 ENUM_ELEM(ary, UINT2NUM(c));
9778 ptr += n;
9779 }
9780 RB_GC_GUARD(str);
9781 if (ary)
9782 return ary;
9783 else
9784 return orig;
9785}
9786
9787/*
9788 * call-seq:
9789 * each_codepoint {|integer| ... } -> self
9790 * each_codepoint -> enumerator
9791 *
9792 * :include: doc/string/each_codepoint.rdoc
9793 *
9794 */
9795
9796static VALUE
9797rb_str_each_codepoint(VALUE str)
9798{
9799 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9800 return rb_str_enumerate_codepoints(str, 0);
9801}
9802
9803/*
9804 * call-seq:
9805 * codepoints -> array_of_integers
9806 *
9807 * :include: doc/string/codepoints.rdoc
9808 *
9809 */
9810
9811static VALUE
9812rb_str_codepoints(VALUE str)
9813{
9814 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9815 return rb_str_enumerate_codepoints(str, ary);
9816}
9817
9818static regex_t *
9819get_reg_grapheme_cluster(rb_encoding *enc)
9820{
9821 int encidx = rb_enc_to_index(enc);
9822
9823 const OnigUChar source_ascii[] = "\\X";
9824 const OnigUChar *source = source_ascii;
9825 size_t source_len = sizeof(source_ascii) - 1;
9826
9827 switch (encidx) {
9828#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9829#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9830#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9831#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9832#define CASE_UTF(e) \
9833 case ENCINDEX_UTF_##e: { \
9834 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9835 source = source_UTF_##e; \
9836 source_len = sizeof(source_UTF_##e); \
9837 break; \
9838 }
9839 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9840#undef CASE_UTF
9841#undef CHARS_16BE
9842#undef CHARS_16LE
9843#undef CHARS_32BE
9844#undef CHARS_32LE
9845 }
9846
9847 regex_t *reg_grapheme_cluster;
9848 OnigErrorInfo einfo;
9849 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9850 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9851 if (r) {
9852 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9853 onig_error_code_to_str(message, r, &einfo);
9854 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9855 }
9856
9857 return reg_grapheme_cluster;
9858}
9859
9860static regex_t *
9861get_cached_reg_grapheme_cluster(rb_encoding *enc)
9862{
9863 int encidx = rb_enc_to_index(enc);
9864 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9865
9866 if (encidx == rb_utf8_encindex()) {
9867 if (!reg_grapheme_cluster_utf8) {
9868 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9869 }
9870
9871 return reg_grapheme_cluster_utf8;
9872 }
9873
9874 return NULL;
9875}
9876
9877static VALUE
9878rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9879{
9880 size_t grapheme_cluster_count = 0;
9881 rb_encoding *enc = get_encoding(str);
9882 const char *ptr, *end;
9883
9884 if (!rb_enc_unicode_p(enc)) {
9885 return rb_str_length(str);
9886 }
9887
9888 bool cached_reg_grapheme_cluster = true;
9889 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9890 if (!reg_grapheme_cluster) {
9891 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9892 cached_reg_grapheme_cluster = false;
9893 }
9894
9895 ptr = RSTRING_PTR(str);
9896 end = RSTRING_END(str);
9897
9898 while (ptr < end) {
9899 OnigPosition len = onig_match(reg_grapheme_cluster,
9900 (const OnigUChar *)ptr, (const OnigUChar *)end,
9901 (const OnigUChar *)ptr, NULL, 0);
9902 if (len <= 0) break;
9903 grapheme_cluster_count++;
9904 ptr += len;
9905 }
9906
9907 if (!cached_reg_grapheme_cluster) {
9908 onig_free(reg_grapheme_cluster);
9909 }
9910
9911 return SIZET2NUM(grapheme_cluster_count);
9912}
9913
9914static VALUE
9915rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9916{
9917 VALUE orig = str;
9918 rb_encoding *enc = get_encoding(str);
9919 const char *ptr0, *ptr, *end;
9920
9921 if (!rb_enc_unicode_p(enc)) {
9922 return rb_str_enumerate_chars(str, ary);
9923 }
9924
9925 if (!ary) str = rb_str_new_frozen(str);
9926
9927 bool cached_reg_grapheme_cluster = true;
9928 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9929 if (!reg_grapheme_cluster) {
9930 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9931 cached_reg_grapheme_cluster = false;
9932 }
9933
9934 ptr0 = ptr = RSTRING_PTR(str);
9935 end = RSTRING_END(str);
9936
9937 while (ptr < end) {
9938 OnigPosition len = onig_match(reg_grapheme_cluster,
9939 (const OnigUChar *)ptr, (const OnigUChar *)end,
9940 (const OnigUChar *)ptr, NULL, 0);
9941 if (len <= 0) break;
9942 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9943 ptr += len;
9944 }
9945
9946 if (!cached_reg_grapheme_cluster) {
9947 onig_free(reg_grapheme_cluster);
9948 }
9949
9950 RB_GC_GUARD(str);
9951 if (ary)
9952 return ary;
9953 else
9954 return orig;
9955}
9956
9957/*
9958 * call-seq:
9959 * each_grapheme_cluster {|gc| ... } -> self
9960 * each_grapheme_cluster -> enumerator
9961 *
9962 * :include: doc/string/each_grapheme_cluster.rdoc
9963 *
9964 */
9965
9966static VALUE
9967rb_str_each_grapheme_cluster(VALUE str)
9968{
9969 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9970 return rb_str_enumerate_grapheme_clusters(str, 0);
9971}
9972
9973/*
9974 * call-seq:
9975 * grapheme_clusters -> array_of_grapheme_clusters
9976 *
9977 * :include: doc/string/grapheme_clusters.rdoc
9978 *
9979 */
9980
9981static VALUE
9982rb_str_grapheme_clusters(VALUE str)
9983{
9984 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9985 return rb_str_enumerate_grapheme_clusters(str, ary);
9986}
9987
9988static long
9989chopped_length(VALUE str)
9990{
9991 rb_encoding *enc = STR_ENC_GET(str);
9992 const char *p, *p2, *beg, *end;
9993
9994 beg = RSTRING_PTR(str);
9995 end = beg + RSTRING_LEN(str);
9996 if (beg >= end) return 0;
9997 p = rb_enc_prev_char(beg, end, end, enc);
9998 if (!p) return 0;
9999 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10000 p2 = rb_enc_prev_char(beg, p, end, enc);
10001 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10002 }
10003 return p - beg;
10004}
10005
10006/*
10007 * call-seq:
10008 * chop! -> self or nil
10009 *
10010 * Like String#chop, but modifies +self+ in place;
10011 * returns +nil+ if +self+ is empty, +self+ otherwise.
10012 *
10013 * Related: String#chomp!.
10014 */
10015
10016static VALUE
10017rb_str_chop_bang(VALUE str)
10018{
10019 str_modify_keep_cr(str);
10020 if (RSTRING_LEN(str) > 0) {
10021 long len;
10022 len = chopped_length(str);
10023 STR_SET_LEN(str, len);
10024 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10025 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10027 }
10028 return str;
10029 }
10030 return Qnil;
10031}
10032
10033
10034/*
10035 * call-seq:
10036 * chop -> new_string
10037 *
10038 * :include: doc/string/chop.rdoc
10039 *
10040 */
10041
10042static VALUE
10043rb_str_chop(VALUE str)
10044{
10045 return rb_str_subseq(str, 0, chopped_length(str));
10046}
10047
10048static long
10049smart_chomp(VALUE str, const char *e, const char *p)
10050{
10051 rb_encoding *enc = rb_enc_get(str);
10052 if (rb_enc_mbminlen(enc) > 1) {
10053 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10054 if (rb_enc_is_newline(pp, e, enc)) {
10055 e = pp;
10056 }
10057 pp = e - rb_enc_mbminlen(enc);
10058 if (pp >= p) {
10059 pp = rb_enc_left_char_head(p, pp, e, enc);
10060 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10061 e = pp;
10062 }
10063 }
10064 }
10065 else {
10066 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10067 case '\n':
10068 if (--e > p && *(e-1) == '\r') {
10069 --e;
10070 }
10071 break;
10072 case '\r':
10073 --e;
10074 break;
10075 }
10076 }
10077 return e - p;
10078}
10079
10080static long
10081chompped_length(VALUE str, VALUE rs)
10082{
10083 rb_encoding *enc;
10084 int newline;
10085 char *pp, *e, *rsptr;
10086 long rslen;
10087 char *const p = RSTRING_PTR(str);
10088 long len = RSTRING_LEN(str);
10089
10090 if (len == 0) return 0;
10091 e = p + len;
10092 if (rs == rb_default_rs) {
10093 return smart_chomp(str, e, p);
10094 }
10095
10096 enc = rb_enc_get(str);
10097 RSTRING_GETMEM(rs, rsptr, rslen);
10098 if (rslen == 0) {
10099 if (rb_enc_mbminlen(enc) > 1) {
10100 while (e > p) {
10101 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10102 if (!rb_enc_is_newline(pp, e, enc)) break;
10103 e = pp;
10104 pp -= rb_enc_mbminlen(enc);
10105 if (pp >= p) {
10106 pp = rb_enc_left_char_head(p, pp, e, enc);
10107 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10108 e = pp;
10109 }
10110 }
10111 }
10112 }
10113 else {
10114 while (e > p && *(e-1) == '\n') {
10115 --e;
10116 if (e > p && *(e-1) == '\r')
10117 --e;
10118 }
10119 }
10120 return e - p;
10121 }
10122 if (rslen > len) return len;
10123
10124 enc = rb_enc_get(rs);
10125 newline = rsptr[rslen-1];
10126 if (rslen == rb_enc_mbminlen(enc)) {
10127 if (rslen == 1) {
10128 if (newline == '\n')
10129 return smart_chomp(str, e, p);
10130 }
10131 else {
10132 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10133 return smart_chomp(str, e, p);
10134 }
10135 }
10136
10137 enc = rb_enc_check(str, rs);
10138 if (is_broken_string(rs)) {
10139 return len;
10140 }
10141 pp = e - rslen;
10142 if (p[len-1] == newline &&
10143 (rslen <= 1 ||
10144 memcmp(rsptr, pp, rslen) == 0)) {
10145 if (at_char_boundary(p, pp, e, enc))
10146 return len - rslen;
10147 RB_GC_GUARD(rs);
10148 }
10149 return len;
10150}
10151
10157static VALUE
10158chomp_rs(int argc, const VALUE *argv)
10159{
10160 rb_check_arity(argc, 0, 1);
10161 if (argc > 0) {
10162 VALUE rs = argv[0];
10163 if (!NIL_P(rs)) StringValue(rs);
10164 return rs;
10165 }
10166 else {
10167 return rb_rs;
10168 }
10169}
10170
10171VALUE
10172rb_str_chomp_string(VALUE str, VALUE rs)
10173{
10174 long olen = RSTRING_LEN(str);
10175 long len = chompped_length(str, rs);
10176 if (len >= olen) return Qnil;
10177 str_modify_keep_cr(str);
10178 STR_SET_LEN(str, len);
10179 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10180 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10182 }
10183 return str;
10184}
10185
10186/*
10187 * call-seq:
10188 * chomp!(line_sep = $/) -> self or nil
10189 *
10190 * Like String#chomp, but modifies +self+ in place;
10191 * returns +nil+ if no modification made, +self+ otherwise.
10192 *
10193 */
10194
10195static VALUE
10196rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10197{
10198 VALUE rs;
10199 str_modifiable(str);
10200 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10201 rs = chomp_rs(argc, argv);
10202 if (NIL_P(rs)) return Qnil;
10203 return rb_str_chomp_string(str, rs);
10204}
10205
10206
10207/*
10208 * call-seq:
10209 * chomp(line_sep = $/) -> new_string
10210 *
10211 * :include: doc/string/chomp.rdoc
10212 *
10213 */
10214
10215static VALUE
10216rb_str_chomp(int argc, VALUE *argv, VALUE str)
10217{
10218 VALUE rs = chomp_rs(argc, argv);
10219 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10220 return rb_str_subseq(str, 0, chompped_length(str, rs));
10221}
10222
10223static long
10224lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10225{
10226 const char *const start = s;
10227
10228 if (!s || s >= e) return 0;
10229
10230 /* remove spaces at head */
10231 if (single_byte_optimizable(str)) {
10232 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10233 }
10234 else {
10235 while (s < e) {
10236 int n;
10237 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10238
10239 if (cc && !rb_isspace(cc)) break;
10240 s += n;
10241 }
10242 }
10243 return s - start;
10244}
10245
10246/*
10247 * call-seq:
10248 * lstrip! -> self or nil
10249 *
10250 * Like String#lstrip, except that any modifications are made in +self+;
10251 * returns +self+ if any modifications are made, +nil+ otherwise.
10252 *
10253 * Related: String#rstrip!, String#strip!.
10254 */
10255
10256static VALUE
10257rb_str_lstrip_bang(VALUE str)
10258{
10259 rb_encoding *enc;
10260 char *start, *s;
10261 long olen, loffset;
10262
10263 str_modify_keep_cr(str);
10264 enc = STR_ENC_GET(str);
10265 RSTRING_GETMEM(str, start, olen);
10266 loffset = lstrip_offset(str, start, start+olen, enc);
10267 if (loffset > 0) {
10268 long len = olen-loffset;
10269 s = start + loffset;
10270 memmove(start, s, len);
10271 STR_SET_LEN(str, len);
10272 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10273 return str;
10274 }
10275 return Qnil;
10276}
10277
10278
10279/*
10280 * call-seq:
10281 * lstrip -> new_string
10282 *
10283 * Returns a copy of +self+ with leading whitespace removed;
10284 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10285 *
10286 * whitespace = "\x00\t\n\v\f\r "
10287 * s = whitespace + 'abc' + whitespace
10288 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10289 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10290 *
10291 * Related: String#rstrip, String#strip.
10292 */
10293
10294static VALUE
10295rb_str_lstrip(VALUE str)
10296{
10297 char *start;
10298 long len, loffset;
10299 RSTRING_GETMEM(str, start, len);
10300 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10301 if (loffset <= 0) return str_duplicate(rb_cString, str);
10302 return rb_str_subseq(str, loffset, len - loffset);
10303}
10304
10305static long
10306rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10307{
10308 const char *t;
10309
10310 rb_str_check_dummy_enc(enc);
10312 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10313 }
10314 if (!s || s >= e) return 0;
10315 t = e;
10316
10317 /* remove trailing spaces or '\0's */
10318 if (single_byte_optimizable(str)) {
10319 unsigned char c;
10320 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10321 }
10322 else {
10323 char *tp;
10324
10325 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10326 unsigned int c = rb_enc_codepoint(tp, e, enc);
10327 if (c && !rb_isspace(c)) break;
10328 t = tp;
10329 }
10330 }
10331 return e - t;
10332}
10333
10334/*
10335 * call-seq:
10336 * rstrip! -> self or nil
10337 *
10338 * Like String#rstrip, except that any modifications are made in +self+;
10339 * returns +self+ if any modifications are made, +nil+ otherwise.
10340 *
10341 * Related: String#lstrip!, String#strip!.
10342 */
10343
10344static VALUE
10345rb_str_rstrip_bang(VALUE str)
10346{
10347 rb_encoding *enc;
10348 char *start;
10349 long olen, roffset;
10350
10351 str_modify_keep_cr(str);
10352 enc = STR_ENC_GET(str);
10353 RSTRING_GETMEM(str, start, olen);
10354 roffset = rstrip_offset(str, start, start+olen, enc);
10355 if (roffset > 0) {
10356 long len = olen - roffset;
10357
10358 STR_SET_LEN(str, len);
10359 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10360 return str;
10361 }
10362 return Qnil;
10363}
10364
10365
10366/*
10367 * call-seq:
10368 * rstrip -> new_string
10369 *
10370 * Returns a copy of the receiver with trailing whitespace removed;
10371 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10372 *
10373 * whitespace = "\x00\t\n\v\f\r "
10374 * s = whitespace + 'abc' + whitespace
10375 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10376 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10377 *
10378 * Related: String#lstrip, String#strip.
10379 */
10380
10381static VALUE
10382rb_str_rstrip(VALUE str)
10383{
10384 rb_encoding *enc;
10385 char *start;
10386 long olen, roffset;
10387
10388 enc = STR_ENC_GET(str);
10389 RSTRING_GETMEM(str, start, olen);
10390 roffset = rstrip_offset(str, start, start+olen, enc);
10391
10392 if (roffset <= 0) return str_duplicate(rb_cString, str);
10393 return rb_str_subseq(str, 0, olen-roffset);
10394}
10395
10396
10397/*
10398 * call-seq:
10399 * strip! -> self or nil
10400 *
10401 * Like String#strip, except that any modifications are made in +self+;
10402 * returns +self+ if any modifications are made, +nil+ otherwise.
10403 *
10404 * Related: String#lstrip!, String#rstrip!.
10405 */
10406
10407static VALUE
10408rb_str_strip_bang(VALUE str)
10409{
10410 char *start;
10411 long olen, loffset, roffset;
10412 rb_encoding *enc;
10413
10414 str_modify_keep_cr(str);
10415 enc = STR_ENC_GET(str);
10416 RSTRING_GETMEM(str, start, olen);
10417 loffset = lstrip_offset(str, start, start+olen, enc);
10418 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10419
10420 if (loffset > 0 || roffset > 0) {
10421 long len = olen-roffset;
10422 if (loffset > 0) {
10423 len -= loffset;
10424 memmove(start, start + loffset, len);
10425 }
10426 STR_SET_LEN(str, len);
10427 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10428 return str;
10429 }
10430 return Qnil;
10431}
10432
10433
10434/*
10435 * call-seq:
10436 * strip -> new_string
10437 *
10438 * Returns a copy of the receiver with leading and trailing whitespace removed;
10439 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10440 *
10441 * whitespace = "\x00\t\n\v\f\r "
10442 * s = whitespace + 'abc' + whitespace
10443 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10444 * s.strip # => "abc"
10445 *
10446 * Related: String#lstrip, String#rstrip.
10447 */
10448
10449static VALUE
10450rb_str_strip(VALUE str)
10451{
10452 char *start;
10453 long olen, loffset, roffset;
10454 rb_encoding *enc = STR_ENC_GET(str);
10455
10456 RSTRING_GETMEM(str, start, olen);
10457 loffset = lstrip_offset(str, start, start+olen, enc);
10458 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10459
10460 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10461 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10462}
10463
10464static VALUE
10465scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10466{
10467 VALUE result = Qnil;
10468 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10469 if (pos >= 0) {
10470 VALUE match;
10471 struct re_registers *regs;
10472 if (BUILTIN_TYPE(pat) == T_STRING) {
10473 regs = NULL;
10474 end = pos + RSTRING_LEN(pat);
10475 }
10476 else {
10477 match = rb_backref_get();
10478 regs = RMATCH_REGS(match);
10479 pos = BEG(0);
10480 end = END(0);
10481 }
10482
10483 if (pos == end) {
10484 rb_encoding *enc = STR_ENC_GET(str);
10485 /*
10486 * Always consume at least one character of the input string
10487 */
10488 if (RSTRING_LEN(str) > end)
10489 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10490 RSTRING_END(str), enc);
10491 else
10492 *start = end + 1;
10493 }
10494 else {
10495 *start = end;
10496 }
10497
10498 if (!regs || regs->num_regs == 1) {
10499 result = rb_str_subseq(str, pos, end - pos);
10500 return result;
10501 }
10502 else {
10503 result = rb_ary_new2(regs->num_regs);
10504 for (int i = 1; i < regs->num_regs; i++) {
10505 VALUE s = Qnil;
10506 if (BEG(i) >= 0) {
10507 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10508 }
10509
10510 rb_ary_push(result, s);
10511 }
10512 }
10513
10514 RB_GC_GUARD(match);
10515 }
10516
10517 return result;
10518}
10519
10520
10521/*
10522 * call-seq:
10523 * scan(string_or_regexp) -> array
10524 * scan(string_or_regexp) {|matches| ... } -> self
10525 *
10526 * Matches a pattern against +self+; the pattern is:
10527 *
10528 * - +string_or_regexp+ itself, if it is a Regexp.
10529 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10530 *
10531 * Iterates through +self+, generating a collection of matching results:
10532 *
10533 * - If the pattern contains no groups, each result is the
10534 * matched string, <code>$&</code>.
10535 * - If the pattern contains groups, each result is an array
10536 * containing one entry per group.
10537 *
10538 * With no block given, returns an array of the results:
10539 *
10540 * s = 'cruel world'
10541 * s.scan(/\w+/) # => ["cruel", "world"]
10542 * s.scan(/.../) # => ["cru", "el ", "wor"]
10543 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10544 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10545 *
10546 * With a block given, calls the block with each result; returns +self+:
10547 *
10548 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10549 * print "\n"
10550 * s.scan(/(.)(.)/) {|x,y| print y, x }
10551 * print "\n"
10552 *
10553 * Output:
10554 *
10555 * <<cruel>> <<world>>
10556 * rceu lowlr
10557 *
10558 */
10559
10560static VALUE
10561rb_str_scan(VALUE str, VALUE pat)
10562{
10563 VALUE result;
10564 long start = 0;
10565 long last = -1, prev = 0;
10566 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10567
10568 pat = get_pat_quoted(pat, 1);
10569 mustnot_broken(str);
10570 if (!rb_block_given_p()) {
10571 VALUE ary = rb_ary_new();
10572
10573 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10574 last = prev;
10575 prev = start;
10576 rb_ary_push(ary, result);
10577 }
10578 if (last >= 0) rb_pat_search(pat, str, last, 1);
10579 else rb_backref_set(Qnil);
10580 return ary;
10581 }
10582
10583 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10584 last = prev;
10585 prev = start;
10586 rb_yield(result);
10587 str_mod_check(str, p, len);
10588 }
10589 if (last >= 0) rb_pat_search(pat, str, last, 1);
10590 return str;
10591}
10592
10593
10594/*
10595 * call-seq:
10596 * hex -> integer
10597 *
10598 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10599 * (with an optional sign and an optional <code>0x</code>) and returns the
10600 * corresponding number;
10601 * returns zero if there is no such leading substring:
10602 *
10603 * '0x0a'.hex # => 10
10604 * '-1234'.hex # => -4660
10605 * '0'.hex # => 0
10606 * 'non-numeric'.hex # => 0
10607 *
10608 * Related: String#oct.
10609 *
10610 */
10611
10612static VALUE
10613rb_str_hex(VALUE str)
10614{
10615 return rb_str_to_inum(str, 16, FALSE);
10616}
10617
10618
10619/*
10620 * call-seq:
10621 * oct -> integer
10622 *
10623 * Interprets the leading substring of +self+ as a string of octal digits
10624 * (with an optional sign) and returns the corresponding number;
10625 * returns zero if there is no such leading substring:
10626 *
10627 * '123'.oct # => 83
10628 * '-377'.oct # => -255
10629 * '0377non-numeric'.oct # => 255
10630 * 'non-numeric'.oct # => 0
10631 *
10632 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10633 * see Kernel#Integer.
10634 *
10635 * Related: String#hex.
10636 *
10637 */
10638
10639static VALUE
10640rb_str_oct(VALUE str)
10641{
10642 return rb_str_to_inum(str, -8, FALSE);
10643}
10644
10645#ifndef HAVE_CRYPT_R
10646# include "ruby/thread_native.h"
10647# include "ruby/atomic.h"
10648
10649static struct {
10650 rb_nativethread_lock_t lock;
10651} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10652
10653static void
10654crypt_mutex_initialize(void)
10655{
10656}
10657#endif
10658
10659/*
10660 * call-seq:
10661 * crypt(salt_str) -> new_string
10662 *
10663 * Returns the string generated by calling <code>crypt(3)</code>
10664 * standard library function with <code>str</code> and
10665 * <code>salt_str</code>, in this order, as its arguments. Please do
10666 * not use this method any longer. It is legacy; provided only for
10667 * backward compatibility with ruby scripts in earlier days. It is
10668 * bad to use in contemporary programs for several reasons:
10669 *
10670 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10671 * run. The generated string lacks data portability.
10672 *
10673 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10674 * (i.e. silently ends up in unexpected results).
10675 *
10676 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10677 * thread safe.
10678 *
10679 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10680 * very very weak. According to its manpage, Linux's traditional
10681 * <code>crypt(3)</code> output has only 2**56 variations; too
10682 * easy to brute force today. And this is the default behaviour.
10683 *
10684 * * In order to make things robust some OSes implement so-called
10685 * "modular" usage. To go through, you have to do a complex
10686 * build-up of the <code>salt_str</code> parameter, by hand.
10687 * Failure in generation of a proper salt string tends not to
10688 * yield any errors; typos in parameters are normally not
10689 * detectable.
10690 *
10691 * * For instance, in the following example, the second invocation
10692 * of String#crypt is wrong; it has a typo in "round=" (lacks
10693 * "s"). However the call does not fail and something unexpected
10694 * is generated.
10695 *
10696 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10697 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10698 *
10699 * * Even in the "modular" mode, some hash functions are considered
10700 * archaic and no longer recommended at all; for instance module
10701 * <code>$1$</code> is officially abandoned by its author: see
10702 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10703 * instance module <code>$3$</code> is considered completely
10704 * broken: see the manpage of FreeBSD.
10705 *
10706 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10707 * written above, <code>crypt(3)</code> on Mac OS never fails.
10708 * This means even if you build up a proper salt string it
10709 * generates a traditional DES hash anyways, and there is no way
10710 * for you to be aware of.
10711 *
10712 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10713 *
10714 * If for some reason you cannot migrate to other secure contemporary
10715 * password hashing algorithms, install the string-crypt gem and
10716 * <code>require 'string/crypt'</code> to continue using it.
10717 */
10718
10719static VALUE
10720rb_str_crypt(VALUE str, VALUE salt)
10721{
10722#ifdef HAVE_CRYPT_R
10723 VALUE databuf;
10724 struct crypt_data *data;
10725# define CRYPT_END() ALLOCV_END(databuf)
10726#else
10727 extern char *crypt(const char *, const char *);
10728# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10729#endif
10730 VALUE result;
10731 const char *s, *saltp;
10732 char *res;
10733#ifdef BROKEN_CRYPT
10734 char salt_8bit_clean[3];
10735#endif
10736
10737 StringValue(salt);
10738 mustnot_wchar(str);
10739 mustnot_wchar(salt);
10740 s = StringValueCStr(str);
10741 saltp = RSTRING_PTR(salt);
10742 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10743 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10744 }
10745
10746#ifdef BROKEN_CRYPT
10747 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10748 salt_8bit_clean[0] = saltp[0] & 0x7f;
10749 salt_8bit_clean[1] = saltp[1] & 0x7f;
10750 salt_8bit_clean[2] = '\0';
10751 saltp = salt_8bit_clean;
10752 }
10753#endif
10754#ifdef HAVE_CRYPT_R
10755 data = ALLOCV(databuf, sizeof(struct crypt_data));
10756# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10757 data->initialized = 0;
10758# endif
10759 res = crypt_r(s, saltp, data);
10760#else
10761 crypt_mutex_initialize();
10762 rb_nativethread_lock_lock(&crypt_mutex.lock);
10763 res = crypt(s, saltp);
10764#endif
10765 if (!res) {
10766 int err = errno;
10767 CRYPT_END();
10768 rb_syserr_fail(err, "crypt");
10769 }
10770 result = rb_str_new_cstr(res);
10771 CRYPT_END();
10772 return result;
10773}
10774
10775
10776/*
10777 * call-seq:
10778 * ord -> integer
10779 *
10780 * :include: doc/string/ord.rdoc
10781 *
10782 */
10783
10784static VALUE
10785rb_str_ord(VALUE s)
10786{
10787 unsigned int c;
10788
10789 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10790 return UINT2NUM(c);
10791}
10792/*
10793 * call-seq:
10794 * sum(n = 16) -> integer
10795 *
10796 * :include: doc/string/sum.rdoc
10797 *
10798 */
10799
10800static VALUE
10801rb_str_sum(int argc, VALUE *argv, VALUE str)
10802{
10803 int bits = 16;
10804 char *ptr, *p, *pend;
10805 long len;
10806 VALUE sum = INT2FIX(0);
10807 unsigned long sum0 = 0;
10808
10809 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10810 bits = 0;
10811 }
10812 ptr = p = RSTRING_PTR(str);
10813 len = RSTRING_LEN(str);
10814 pend = p + len;
10815
10816 while (p < pend) {
10817 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10818 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10819 str_mod_check(str, ptr, len);
10820 sum0 = 0;
10821 }
10822 sum0 += (unsigned char)*p;
10823 p++;
10824 }
10825
10826 if (bits == 0) {
10827 if (sum0) {
10828 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10829 }
10830 }
10831 else {
10832 if (sum == INT2FIX(0)) {
10833 if (bits < (int)sizeof(long)*CHAR_BIT) {
10834 sum0 &= (((unsigned long)1)<<bits)-1;
10835 }
10836 sum = LONG2FIX(sum0);
10837 }
10838 else {
10839 VALUE mod;
10840
10841 if (sum0) {
10842 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10843 }
10844
10845 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10846 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10847 sum = rb_funcall(sum, '&', 1, mod);
10848 }
10849 }
10850 return sum;
10851}
10852
10853static VALUE
10854rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10855{
10856 rb_encoding *enc;
10857 VALUE w;
10858 long width, len, flen = 1, fclen = 1;
10859 VALUE res;
10860 char *p;
10861 const char *f = " ";
10862 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10863 VALUE pad;
10864 int singlebyte = 1, cr;
10865 int termlen;
10866
10867 rb_scan_args(argc, argv, "11", &w, &pad);
10868 enc = STR_ENC_GET(str);
10869 termlen = rb_enc_mbminlen(enc);
10870 width = NUM2LONG(w);
10871 if (argc == 2) {
10872 StringValue(pad);
10873 enc = rb_enc_check(str, pad);
10874 f = RSTRING_PTR(pad);
10875 flen = RSTRING_LEN(pad);
10876 fclen = str_strlen(pad, enc); /* rb_enc_check */
10877 singlebyte = single_byte_optimizable(pad);
10878 if (flen == 0 || fclen == 0) {
10879 rb_raise(rb_eArgError, "zero width padding");
10880 }
10881 }
10882 len = str_strlen(str, enc); /* rb_enc_check */
10883 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10884 n = width - len;
10885 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10886 rlen = n - llen;
10887 cr = ENC_CODERANGE(str);
10888 if (flen > 1) {
10889 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10890 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10891 }
10892 size = RSTRING_LEN(str);
10893 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10894 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10895 (len += llen2 + rlen2) >= LONG_MAX - size) {
10896 rb_raise(rb_eArgError, "argument too big");
10897 }
10898 len += size;
10899 res = str_enc_new(rb_cString, 0, len, enc);
10900 p = RSTRING_PTR(res);
10901 if (flen <= 1) {
10902 memset(p, *f, llen);
10903 p += llen;
10904 }
10905 else {
10906 while (llen >= fclen) {
10907 memcpy(p,f,flen);
10908 p += flen;
10909 llen -= fclen;
10910 }
10911 if (llen > 0) {
10912 memcpy(p, f, llen2);
10913 p += llen2;
10914 }
10915 }
10916 memcpy(p, RSTRING_PTR(str), size);
10917 p += size;
10918 if (flen <= 1) {
10919 memset(p, *f, rlen);
10920 p += rlen;
10921 }
10922 else {
10923 while (rlen >= fclen) {
10924 memcpy(p,f,flen);
10925 p += flen;
10926 rlen -= fclen;
10927 }
10928 if (rlen > 0) {
10929 memcpy(p, f, rlen2);
10930 p += rlen2;
10931 }
10932 }
10933 TERM_FILL(p, termlen);
10934 STR_SET_LEN(res, p-RSTRING_PTR(res));
10935
10936 if (argc == 2)
10937 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10938 if (cr != ENC_CODERANGE_BROKEN)
10939 ENC_CODERANGE_SET(res, cr);
10940
10941 RB_GC_GUARD(pad);
10942 return res;
10943}
10944
10945
10946/*
10947 * call-seq:
10948 * ljust(size, pad_string = ' ') -> new_string
10949 *
10950 * :include: doc/string/ljust.rdoc
10951 *
10952 * Related: String#rjust, String#center.
10953 *
10954 */
10955
10956static VALUE
10957rb_str_ljust(int argc, VALUE *argv, VALUE str)
10958{
10959 return rb_str_justify(argc, argv, str, 'l');
10960}
10961
10962/*
10963 * call-seq:
10964 * rjust(size, pad_string = ' ') -> new_string
10965 *
10966 * :include: doc/string/rjust.rdoc
10967 *
10968 * Related: String#ljust, String#center.
10969 *
10970 */
10971
10972static VALUE
10973rb_str_rjust(int argc, VALUE *argv, VALUE str)
10974{
10975 return rb_str_justify(argc, argv, str, 'r');
10976}
10977
10978
10979/*
10980 * call-seq:
10981 * center(size, pad_string = ' ') -> new_string
10982 *
10983 * :include: doc/string/center.rdoc
10984 *
10985 * Related: String#ljust, String#rjust.
10986 *
10987 */
10988
10989static VALUE
10990rb_str_center(int argc, VALUE *argv, VALUE str)
10991{
10992 return rb_str_justify(argc, argv, str, 'c');
10993}
10994
10995/*
10996 * call-seq:
10997 * partition(string_or_regexp) -> [head, match, tail]
10998 *
10999 * :include: doc/string/partition.rdoc
11000 *
11001 */
11002
11003static VALUE
11004rb_str_partition(VALUE str, VALUE sep)
11005{
11006 long pos;
11007
11008 sep = get_pat_quoted(sep, 0);
11009 if (RB_TYPE_P(sep, T_REGEXP)) {
11010 if (rb_reg_search(sep, str, 0, 0) < 0) {
11011 goto failed;
11012 }
11013 VALUE match = rb_backref_get();
11014 struct re_registers *regs = RMATCH_REGS(match);
11015
11016 pos = BEG(0);
11017 sep = rb_str_subseq(str, pos, END(0) - pos);
11018 }
11019 else {
11020 pos = rb_str_index(str, sep, 0);
11021 if (pos < 0) goto failed;
11022 }
11023 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11024 sep,
11025 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11026 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11027
11028 failed:
11029 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11030}
11031
11032/*
11033 * call-seq:
11034 * rpartition(sep) -> [head, match, tail]
11035 *
11036 * :include: doc/string/rpartition.rdoc
11037 *
11038 */
11039
11040static VALUE
11041rb_str_rpartition(VALUE str, VALUE sep)
11042{
11043 long pos = RSTRING_LEN(str);
11044
11045 sep = get_pat_quoted(sep, 0);
11046 if (RB_TYPE_P(sep, T_REGEXP)) {
11047 if (rb_reg_search(sep, str, pos, 1) < 0) {
11048 goto failed;
11049 }
11050 VALUE match = rb_backref_get();
11051 struct re_registers *regs = RMATCH_REGS(match);
11052
11053 pos = BEG(0);
11054 sep = rb_str_subseq(str, pos, END(0) - pos);
11055 }
11056 else {
11057 pos = rb_str_sublen(str, pos);
11058 pos = rb_str_rindex(str, sep, pos);
11059 if (pos < 0) {
11060 goto failed;
11061 }
11062 }
11063
11064 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11065 sep,
11066 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11067 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11068 failed:
11069 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11070}
11071
11072/*
11073 * call-seq:
11074 * start_with?(*string_or_regexp) -> true or false
11075 *
11076 * :include: doc/string/start_with_p.rdoc
11077 *
11078 */
11079
11080static VALUE
11081rb_str_start_with(int argc, VALUE *argv, VALUE str)
11082{
11083 int i;
11084
11085 for (i=0; i<argc; i++) {
11086 VALUE tmp = argv[i];
11087 if (RB_TYPE_P(tmp, T_REGEXP)) {
11088 if (rb_reg_start_with_p(tmp, str))
11089 return Qtrue;
11090 }
11091 else {
11092 const char *p, *s, *e;
11093 long slen, tlen;
11094 rb_encoding *enc;
11095
11096 StringValue(tmp);
11097 enc = rb_enc_check(str, tmp);
11098 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11099 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11100 p = RSTRING_PTR(str);
11101 e = p + slen;
11102 s = p + tlen;
11103 if (!at_char_right_boundary(p, s, e, enc))
11104 continue;
11105 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11106 return Qtrue;
11107 }
11108 }
11109 return Qfalse;
11110}
11111
11112/*
11113 * call-seq:
11114 * end_with?(*strings) -> true or false
11115 *
11116 * :include: doc/string/end_with_p.rdoc
11117 *
11118 */
11119
11120static VALUE
11121rb_str_end_with(int argc, VALUE *argv, VALUE str)
11122{
11123 int i;
11124
11125 for (i=0; i<argc; i++) {
11126 VALUE tmp = argv[i];
11127 const char *p, *s, *e;
11128 long slen, tlen;
11129 rb_encoding *enc;
11130
11131 StringValue(tmp);
11132 enc = rb_enc_check(str, tmp);
11133 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11134 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11135 p = RSTRING_PTR(str);
11136 e = p + slen;
11137 s = e - tlen;
11138 if (!at_char_boundary(p, s, e, enc))
11139 continue;
11140 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11141 return Qtrue;
11142 }
11143 return Qfalse;
11144}
11145
11155static long
11156deleted_prefix_length(VALUE str, VALUE prefix)
11157{
11158 const char *strptr, *prefixptr;
11159 long olen, prefixlen;
11160 rb_encoding *enc = rb_enc_get(str);
11161
11162 StringValue(prefix);
11163
11164 if (!is_broken_string(prefix) ||
11165 !rb_enc_asciicompat(enc) ||
11166 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11167 enc = rb_enc_check(str, prefix);
11168 }
11169
11170 /* return 0 if not start with prefix */
11171 prefixlen = RSTRING_LEN(prefix);
11172 if (prefixlen <= 0) return 0;
11173 olen = RSTRING_LEN(str);
11174 if (olen < prefixlen) return 0;
11175 strptr = RSTRING_PTR(str);
11176 prefixptr = RSTRING_PTR(prefix);
11177 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11178 if (is_broken_string(prefix)) {
11179 if (!is_broken_string(str)) {
11180 /* prefix in a valid string cannot be broken */
11181 return 0;
11182 }
11183 const char *strend = strptr + olen;
11184 const char *after_prefix = strptr + prefixlen;
11185 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11186 /* prefix does not end at char-boundary */
11187 return 0;
11188 }
11189 }
11190 /* prefix part in `str` also should be valid. */
11191
11192 return prefixlen;
11193}
11194
11195/*
11196 * call-seq:
11197 * delete_prefix!(prefix) -> self or nil
11198 *
11199 * Like String#delete_prefix, except that +self+ is modified in place.
11200 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11201 *
11202 */
11203
11204static VALUE
11205rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11206{
11207 long prefixlen;
11208 str_modify_keep_cr(str);
11209
11210 prefixlen = deleted_prefix_length(str, prefix);
11211 if (prefixlen <= 0) return Qnil;
11212
11213 return rb_str_drop_bytes(str, prefixlen);
11214}
11215
11216/*
11217 * call-seq:
11218 * delete_prefix(prefix) -> new_string
11219 *
11220 * :include: doc/string/delete_prefix.rdoc
11221 *
11222 */
11223
11224static VALUE
11225rb_str_delete_prefix(VALUE str, VALUE prefix)
11226{
11227 long prefixlen;
11228
11229 prefixlen = deleted_prefix_length(str, prefix);
11230 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11231
11232 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11233}
11234
11244static long
11245deleted_suffix_length(VALUE str, VALUE suffix)
11246{
11247 const char *strptr, *suffixptr;
11248 long olen, suffixlen;
11249 rb_encoding *enc;
11250
11251 StringValue(suffix);
11252 if (is_broken_string(suffix)) return 0;
11253 enc = rb_enc_check(str, suffix);
11254
11255 /* return 0 if not start with suffix */
11256 suffixlen = RSTRING_LEN(suffix);
11257 if (suffixlen <= 0) return 0;
11258 olen = RSTRING_LEN(str);
11259 if (olen < suffixlen) return 0;
11260 strptr = RSTRING_PTR(str);
11261 suffixptr = RSTRING_PTR(suffix);
11262 const char *strend = strptr + olen;
11263 const char *before_suffix = strend - suffixlen;
11264 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11265 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11266
11267 return suffixlen;
11268}
11269
11270/*
11271 * call-seq:
11272 * delete_suffix!(suffix) -> self or nil
11273 *
11274 * Like String#delete_suffix, except that +self+ is modified in place.
11275 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11276 *
11277 */
11278
11279static VALUE
11280rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11281{
11282 long olen, suffixlen, len;
11283 str_modifiable(str);
11284
11285 suffixlen = deleted_suffix_length(str, suffix);
11286 if (suffixlen <= 0) return Qnil;
11287
11288 olen = RSTRING_LEN(str);
11289 str_modify_keep_cr(str);
11290 len = olen - suffixlen;
11291 STR_SET_LEN(str, len);
11292 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11293 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11295 }
11296 return str;
11297}
11298
11299/*
11300 * call-seq:
11301 * delete_suffix(suffix) -> new_string
11302 *
11303 * :include: doc/string/delete_suffix.rdoc
11304 *
11305 */
11306
11307static VALUE
11308rb_str_delete_suffix(VALUE str, VALUE suffix)
11309{
11310 long suffixlen;
11311
11312 suffixlen = deleted_suffix_length(str, suffix);
11313 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11314
11315 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11316}
11317
11318void
11319rb_str_setter(VALUE val, ID id, VALUE *var)
11320{
11321 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11322 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11323 }
11324 *var = val;
11325}
11326
11327static void
11328rb_fs_setter(VALUE val, ID id, VALUE *var)
11329{
11330 val = rb_fs_check(val);
11331 if (!val) {
11332 rb_raise(rb_eTypeError,
11333 "value of %"PRIsVALUE" must be String or Regexp",
11334 rb_id2str(id));
11335 }
11336 if (!NIL_P(val)) {
11337 rb_warn_deprecated("'$;'", NULL);
11338 }
11339 *var = val;
11340}
11341
11342
11343/*
11344 * call-seq:
11345 * force_encoding(encoding) -> self
11346 *
11347 * :include: doc/string/force_encoding.rdoc
11348 *
11349 */
11350
11351static VALUE
11352rb_str_force_encoding(VALUE str, VALUE enc)
11353{
11354 str_modifiable(str);
11355
11356 rb_encoding *encoding = rb_to_encoding(enc);
11357 int idx = rb_enc_to_index(encoding);
11358
11359 // If the encoding is unchanged, we do nothing.
11360 if (ENCODING_GET(str) == idx) {
11361 return str;
11362 }
11363
11364 rb_enc_associate_index(str, idx);
11365
11366 // If the coderange was 7bit and the new encoding is ASCII-compatible
11367 // we can keep the coderange.
11368 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11369 return str;
11370 }
11371
11373 return str;
11374}
11375
11376/*
11377 * call-seq:
11378 * b -> string
11379 *
11380 * :include: doc/string/b.rdoc
11381 *
11382 */
11383
11384static VALUE
11385rb_str_b(VALUE str)
11386{
11387 VALUE str2;
11388 if (STR_EMBED_P(str)) {
11389 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11390 }
11391 else {
11392 str2 = str_alloc_heap(rb_cString);
11393 }
11394 str_replace_shared_without_enc(str2, str);
11395
11396 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11397 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11398 // If we know the receiver's code range then we know the result's code range.
11399 int cr = ENC_CODERANGE(str);
11400 switch (cr) {
11401 case ENC_CODERANGE_7BIT:
11403 break;
11407 break;
11408 default:
11409 ENC_CODERANGE_CLEAR(str2);
11410 break;
11411 }
11412 }
11413
11414 return str2;
11415}
11416
11417/*
11418 * call-seq:
11419 * valid_encoding? -> true or false
11420 *
11421 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11422 *
11423 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11424 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11425 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11426 */
11427
11428static VALUE
11429rb_str_valid_encoding_p(VALUE str)
11430{
11431 int cr = rb_enc_str_coderange(str);
11432
11433 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11434}
11435
11436/*
11437 * call-seq:
11438 * ascii_only? -> true or false
11439 *
11440 * Returns +true+ if +self+ contains only ASCII characters,
11441 * +false+ otherwise:
11442 *
11443 * 'abc'.ascii_only? # => true
11444 * "abc\u{6666}".ascii_only? # => false
11445 *
11446 */
11447
11448static VALUE
11449rb_str_is_ascii_only_p(VALUE str)
11450{
11451 int cr = rb_enc_str_coderange(str);
11452
11453 return RBOOL(cr == ENC_CODERANGE_7BIT);
11454}
11455
11456VALUE
11458{
11459 static const char ellipsis[] = "...";
11460 const long ellipsislen = sizeof(ellipsis) - 1;
11461 rb_encoding *const enc = rb_enc_get(str);
11462 const long blen = RSTRING_LEN(str);
11463 const char *const p = RSTRING_PTR(str), *e = p + blen;
11464 VALUE estr, ret = 0;
11465
11466 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11467 if (len * rb_enc_mbminlen(enc) >= blen ||
11468 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11469 ret = str;
11470 }
11471 else if (len <= ellipsislen ||
11472 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11473 if (rb_enc_asciicompat(enc)) {
11474 ret = rb_str_new(ellipsis, len);
11475 rb_enc_associate(ret, enc);
11476 }
11477 else {
11478 estr = rb_usascii_str_new(ellipsis, len);
11479 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11480 }
11481 }
11482 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11483 rb_str_cat(ret, ellipsis, ellipsislen);
11484 }
11485 else {
11486 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11487 rb_enc_from_encoding(enc), 0, Qnil);
11488 rb_str_append(ret, estr);
11489 }
11490 return ret;
11491}
11492
11493static VALUE
11494str_compat_and_valid(VALUE str, rb_encoding *enc)
11495{
11496 int cr;
11497 str = StringValue(str);
11498 cr = rb_enc_str_coderange(str);
11499 if (cr == ENC_CODERANGE_BROKEN) {
11500 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11501 }
11502 else {
11503 rb_encoding *e = STR_ENC_GET(str);
11504 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11505 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11506 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11507 }
11508 }
11509 return str;
11510}
11511
11512static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11513
11514VALUE
11516{
11517 rb_encoding *enc = STR_ENC_GET(str);
11518 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11519}
11520
11521VALUE
11522rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11523{
11524 int cr = ENC_CODERANGE_UNKNOWN;
11525 if (enc == STR_ENC_GET(str)) {
11526 /* cached coderange makes sense only when enc equals the
11527 * actual encoding of str */
11528 cr = ENC_CODERANGE(str);
11529 }
11530 return enc_str_scrub(enc, str, repl, cr);
11531}
11532
11533static VALUE
11534enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11535{
11536 int encidx;
11537 VALUE buf = Qnil;
11538 const char *rep, *p, *e, *p1, *sp;
11539 long replen = -1;
11540 long slen;
11541
11542 if (rb_block_given_p()) {
11543 if (!NIL_P(repl))
11544 rb_raise(rb_eArgError, "both of block and replacement given");
11545 replen = 0;
11546 }
11547
11548 if (ENC_CODERANGE_CLEAN_P(cr))
11549 return Qnil;
11550
11551 if (!NIL_P(repl)) {
11552 repl = str_compat_and_valid(repl, enc);
11553 }
11554
11555 if (rb_enc_dummy_p(enc)) {
11556 return Qnil;
11557 }
11558 encidx = rb_enc_to_index(enc);
11559
11560#define DEFAULT_REPLACE_CHAR(str) do { \
11561 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11562 rep = replace; replen = (int)sizeof(replace); \
11563 } while (0)
11564
11565 slen = RSTRING_LEN(str);
11566 p = RSTRING_PTR(str);
11567 e = RSTRING_END(str);
11568 p1 = p;
11569 sp = p;
11570
11571 if (rb_enc_asciicompat(enc)) {
11572 int rep7bit_p;
11573 if (!replen) {
11574 rep = NULL;
11575 rep7bit_p = FALSE;
11576 }
11577 else if (!NIL_P(repl)) {
11578 rep = RSTRING_PTR(repl);
11579 replen = RSTRING_LEN(repl);
11580 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11581 }
11582 else if (encidx == rb_utf8_encindex()) {
11583 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11584 rep7bit_p = FALSE;
11585 }
11586 else {
11587 DEFAULT_REPLACE_CHAR("?");
11588 rep7bit_p = TRUE;
11589 }
11590 cr = ENC_CODERANGE_7BIT;
11591
11592 p = search_nonascii(p, e);
11593 if (!p) {
11594 p = e;
11595 }
11596 while (p < e) {
11597 int ret = rb_enc_precise_mbclen(p, e, enc);
11598 if (MBCLEN_NEEDMORE_P(ret)) {
11599 break;
11600 }
11601 else if (MBCLEN_CHARFOUND_P(ret)) {
11603 p += MBCLEN_CHARFOUND_LEN(ret);
11604 }
11605 else if (MBCLEN_INVALID_P(ret)) {
11606 /*
11607 * p1~p: valid ascii/multibyte chars
11608 * p ~e: invalid bytes + unknown bytes
11609 */
11610 long clen = rb_enc_mbmaxlen(enc);
11611 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11612 if (p > p1) {
11613 rb_str_buf_cat(buf, p1, p - p1);
11614 }
11615
11616 if (e - p < clen) clen = e - p;
11617 if (clen <= 2) {
11618 clen = 1;
11619 }
11620 else {
11621 const char *q = p;
11622 clen--;
11623 for (; clen > 1; clen--) {
11624 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11625 if (MBCLEN_NEEDMORE_P(ret)) break;
11626 if (MBCLEN_INVALID_P(ret)) continue;
11628 }
11629 }
11630 if (rep) {
11631 rb_str_buf_cat(buf, rep, replen);
11632 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11633 }
11634 else {
11635 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11636 str_mod_check(str, sp, slen);
11637 repl = str_compat_and_valid(repl, enc);
11638 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11641 }
11642 p += clen;
11643 p1 = p;
11644 p = search_nonascii(p, e);
11645 if (!p) {
11646 p = e;
11647 break;
11648 }
11649 }
11650 else {
11652 }
11653 }
11654 if (NIL_P(buf)) {
11655 if (p == e) {
11656 ENC_CODERANGE_SET(str, cr);
11657 return Qnil;
11658 }
11659 buf = rb_str_buf_new(RSTRING_LEN(str));
11660 }
11661 if (p1 < p) {
11662 rb_str_buf_cat(buf, p1, p - p1);
11663 }
11664 if (p < e) {
11665 if (rep) {
11666 rb_str_buf_cat(buf, rep, replen);
11667 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11668 }
11669 else {
11670 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11671 str_mod_check(str, sp, slen);
11672 repl = str_compat_and_valid(repl, enc);
11673 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11676 }
11677 }
11678 }
11679 else {
11680 /* ASCII incompatible */
11681 long mbminlen = rb_enc_mbminlen(enc);
11682 if (!replen) {
11683 rep = NULL;
11684 }
11685 else if (!NIL_P(repl)) {
11686 rep = RSTRING_PTR(repl);
11687 replen = RSTRING_LEN(repl);
11688 }
11689 else if (encidx == ENCINDEX_UTF_16BE) {
11690 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11691 }
11692 else if (encidx == ENCINDEX_UTF_16LE) {
11693 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11694 }
11695 else if (encidx == ENCINDEX_UTF_32BE) {
11696 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11697 }
11698 else if (encidx == ENCINDEX_UTF_32LE) {
11699 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11700 }
11701 else {
11702 DEFAULT_REPLACE_CHAR("?");
11703 }
11704
11705 while (p < e) {
11706 int ret = rb_enc_precise_mbclen(p, e, enc);
11707 if (MBCLEN_NEEDMORE_P(ret)) {
11708 break;
11709 }
11710 else if (MBCLEN_CHARFOUND_P(ret)) {
11711 p += MBCLEN_CHARFOUND_LEN(ret);
11712 }
11713 else if (MBCLEN_INVALID_P(ret)) {
11714 const char *q = p;
11715 long clen = rb_enc_mbmaxlen(enc);
11716 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11717 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11718
11719 if (e - p < clen) clen = e - p;
11720 if (clen <= mbminlen * 2) {
11721 clen = mbminlen;
11722 }
11723 else {
11724 clen -= mbminlen;
11725 for (; clen > mbminlen; clen-=mbminlen) {
11726 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11727 if (MBCLEN_NEEDMORE_P(ret)) break;
11728 if (MBCLEN_INVALID_P(ret)) continue;
11730 }
11731 }
11732 if (rep) {
11733 rb_str_buf_cat(buf, rep, replen);
11734 }
11735 else {
11736 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11737 str_mod_check(str, sp, slen);
11738 repl = str_compat_and_valid(repl, enc);
11739 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11740 }
11741 p += clen;
11742 p1 = p;
11743 }
11744 else {
11746 }
11747 }
11748 if (NIL_P(buf)) {
11749 if (p == e) {
11751 return Qnil;
11752 }
11753 buf = rb_str_buf_new(RSTRING_LEN(str));
11754 }
11755 if (p1 < p) {
11756 rb_str_buf_cat(buf, p1, p - p1);
11757 }
11758 if (p < e) {
11759 if (rep) {
11760 rb_str_buf_cat(buf, rep, replen);
11761 }
11762 else {
11763 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11764 str_mod_check(str, sp, slen);
11765 repl = str_compat_and_valid(repl, enc);
11766 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11767 }
11768 }
11770 }
11771 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11772 return buf;
11773}
11774
11775/*
11776 * call-seq:
11777 * scrub(replacement_string = default_replacement) -> new_string
11778 * scrub{|bytes| ... } -> new_string
11779 *
11780 * :include: doc/string/scrub.rdoc
11781 *
11782 */
11783static VALUE
11784str_scrub(int argc, VALUE *argv, VALUE str)
11785{
11786 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11787 VALUE new = rb_str_scrub(str, repl);
11788 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11789}
11790
11791/*
11792 * call-seq:
11793 * scrub! -> self
11794 * scrub!(replacement_string = default_replacement) -> self
11795 * scrub!{|bytes| ... } -> self
11796 *
11797 * Like String#scrub, except that any replacements are made in +self+.
11798 *
11799 */
11800static VALUE
11801str_scrub_bang(int argc, VALUE *argv, VALUE str)
11802{
11803 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11804 VALUE new = rb_str_scrub(str, repl);
11805 if (!NIL_P(new)) rb_str_replace(str, new);
11806 return str;
11807}
11808
11809static ID id_normalize;
11810static ID id_normalized_p;
11811static VALUE mUnicodeNormalize;
11812
11813static VALUE
11814unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11815{
11816 static int UnicodeNormalizeRequired = 0;
11817 VALUE argv2[2];
11818
11819 if (!UnicodeNormalizeRequired) {
11820 rb_require("unicode_normalize/normalize.rb");
11821 UnicodeNormalizeRequired = 1;
11822 }
11823 argv2[0] = str;
11824 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11825 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11826}
11827
11828/*
11829 * call-seq:
11830 * unicode_normalize(form = :nfc) -> string
11831 *
11832 * Returns a copy of +self+ with
11833 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11834 *
11835 * Argument +form+ must be one of the following symbols
11836 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11837 *
11838 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11839 * - +:nfd+: Canonical decomposition.
11840 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11841 * - +:nfkd+: Compatibility decomposition.
11842 *
11843 * The encoding of +self+ must be one of:
11844 *
11845 * - Encoding::UTF_8
11846 * - Encoding::UTF_16BE
11847 * - Encoding::UTF_16LE
11848 * - Encoding::UTF_32BE
11849 * - Encoding::UTF_32LE
11850 * - Encoding::GB18030
11851 * - Encoding::UCS_2BE
11852 * - Encoding::UCS_4BE
11853 *
11854 * Examples:
11855 *
 *   "a\u0300".unicode_normalize        # => "\u00E0" (one precomposed character)
 *   "\u00E0".unicode_normalize(:nfd)   # => "a\u0300" (base letter plus combining grave accent)
11858 *
11859 * Related: String#unicode_normalize!, String#unicode_normalized?.
11860 */
11861static VALUE
11862rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11863{
11864 return unicode_normalize_common(argc, argv, str, id_normalize);
11865}
11866
11867/*
11868 * call-seq:
11869 * unicode_normalize!(form = :nfc) -> self
11870 *
11871 * Like String#unicode_normalize, except that the normalization
11872 * is performed on +self+.
11873 *
 * Related: String#unicode_normalized?.
11875 *
11876 */
11877static VALUE
11878rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11879{
11880 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11881}
11882
11883/* call-seq:
11884 * unicode_normalized?(form = :nfc) -> true or false
11885 *
11886 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11887 * +false+ otherwise.
11888 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11889 *
11890 * Examples:
11891 *
11892 * "a\u0300".unicode_normalized? # => false
11893 * "a\u0300".unicode_normalized?(:nfd) # => true
11894 * "\u00E0".unicode_normalized? # => true
11895 * "\u00E0".unicode_normalized?(:nfd) # => false
11896 *
11897 *
11898 * Raises an exception if +self+ is not in a Unicode encoding:
11899 *
11900 * s = "\xE0".force_encoding('ISO-8859-1')
11901 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11902 *
11903 * Related: String#unicode_normalize, String#unicode_normalize!.
11904 *
11905 */
11906static VALUE
11907rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11908{
11909 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11910}
11911
11912/**********************************************************************
11913 * Document-class: Symbol
11914 *
11915 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11916 *
11917 * You can create a +Symbol+ object explicitly with:
11918 *
11919 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11920 *
11921 * The same +Symbol+ object will be
11922 * created for a given name or string for the duration of a program's
11923 * execution, regardless of the context or meaning of that name. Thus
11924 * if <code>Fred</code> is a constant in one context, a method in
11925 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11926 * will be the same object in all three contexts.
11927 *
11928 * module One
11929 * class Fred
11930 * end
11931 * $f1 = :Fred
11932 * end
11933 * module Two
11934 * Fred = 1
11935 * $f2 = :Fred
11936 * end
11937 * def Fred()
11938 * end
11939 * $f3 = :Fred
11940 * $f1.object_id #=> 2514190
11941 * $f2.object_id #=> 2514190
11942 * $f3.object_id #=> 2514190
11943 *
11944 * Constant, method, and variable names are returned as symbols:
11945 *
11946 * module One
11947 * Two = 2
11948 * def three; 3 end
11949 * @four = 4
11950 * @@five = 5
11951 * $six = 6
11952 * end
11953 * seven = 7
11954 *
11955 * One.constants
11956 * # => [:Two]
11957 * One.instance_methods(true)
11958 * # => [:three]
11959 * One.instance_variables
11960 * # => [:@four]
11961 * One.class_variables
11962 * # => [:@@five]
11963 * global_variables.grep(/six/)
11964 * # => [:$six]
11965 * local_variables
11966 * # => [:seven]
11967 *
11968 * A +Symbol+ object differs from a String object in that
11969 * a +Symbol+ object represents an identifier, while a String object
11970 * represents text or data.
11971 *
11972 * == What's Here
11973 *
11974 * First, what's elsewhere. \Class +Symbol+:
11975 *
11976 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11977 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11978 *
11979 * Here, class +Symbol+ provides methods that are useful for:
11980 *
11981 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11982 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11983 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11984 *
11985 * === Methods for Querying
11986 *
11987 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11988 * - #=~: Returns the index of the first substring in symbol that matches a
11989 * given Regexp or other object; returns +nil+ if no match is found.
11990 * - #[], #slice : Returns a substring of symbol
11991 * determined by a given index, start/length, or range, or string.
11992 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11993 * - #encoding: Returns the Encoding object that represents the encoding
11994 * of symbol.
11995 * - #end_with?: Returns +true+ if symbol ends with
11996 * any of the given strings.
11997 * - #match: Returns a MatchData object if symbol
11998 * matches a given Regexp; +nil+ otherwise.
11999 * - #match?: Returns +true+ if symbol
12000 * matches a given Regexp; +false+ otherwise.
12001 * - #length, #size: Returns the number of characters in symbol.
12002 * - #start_with?: Returns +true+ if symbol starts with
12003 * any of the given strings.
12004 *
12005 * === Methods for Comparing
12006 *
12007 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12008 * or larger than symbol.
12009 * - #==, #===: Returns +true+ if a given symbol has the same content and
12010 * encoding.
12011 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12012 * symbol is smaller than, equal to, or larger than symbol.
12013 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12014 * after Unicode case folding; +false+ otherwise.
12015 *
12016 * === Methods for Converting
12017 *
12018 * - #capitalize: Returns symbol with the first character upcased
12019 * and all other characters downcased.
12020 * - #downcase: Returns symbol with all characters downcased.
12021 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12022 * - #name: Returns the frozen string corresponding to symbol.
12023 * - #succ, #next: Returns the symbol that is the successor to symbol.
12024 * - #swapcase: Returns symbol with all upcase characters downcased
12025 * and all downcase characters upcased.
12026 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12027 * - #to_s, #id2name: Returns the string corresponding to +self+.
12028 * - #to_sym, #intern: Returns +self+.
12029 * - #upcase: Returns symbol with all characters upcased.
12030 *
12031 */
12032
12033
12034/*
12035 * call-seq:
12036 * symbol == object -> true or false
12037 *
12038 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12039 */
12040
12041#define sym_equal rb_obj_equal
12042
12043static int
12044sym_printable(const char *s, const char *send, rb_encoding *enc)
12045{
12046 while (s < send) {
12047 int n;
12048 int c = rb_enc_precise_mbclen(s, send, enc);
12049
12050 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12051 n = MBCLEN_CHARFOUND_LEN(c);
12052 c = rb_enc_mbc_to_codepoint(s, send, enc);
12053 if (!rb_enc_isprint(c, enc)) return FALSE;
12054 s += n;
12055 }
12056 return TRUE;
12057}
12058
12059int
12060rb_str_symname_p(VALUE sym)
12061{
12062 rb_encoding *enc;
12063 const char *ptr;
12064 long len;
12065 rb_encoding *resenc = rb_default_internal_encoding();
12066
12067 if (resenc == NULL) resenc = rb_default_external_encoding();
12068 enc = STR_ENC_GET(sym);
12069 ptr = RSTRING_PTR(sym);
12070 len = RSTRING_LEN(sym);
12071 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12072 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12073 return FALSE;
12074 }
12075 return TRUE;
12076}
12077
12078VALUE
12079rb_str_quote_unprintable(VALUE str)
12080{
12081 rb_encoding *enc;
12082 const char *ptr;
12083 long len;
12084 rb_encoding *resenc;
12085
12086 Check_Type(str, T_STRING);
12088 if (resenc == NULL) resenc = rb_default_external_encoding();
12089 enc = STR_ENC_GET(str);
12090 ptr = RSTRING_PTR(str);
12091 len = RSTRING_LEN(str);
12092 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12093 !sym_printable(ptr, ptr + len, enc)) {
12094 return rb_str_escape(str);
12095 }
12096 return str;
12097}
12098
12099VALUE
12100rb_id_quote_unprintable(ID id)
12101{
12102 VALUE str = rb_id2str(id);
12103 if (!rb_str_symname_p(str)) {
12104 return rb_str_escape(str);
12105 }
12106 return str;
12107}
12108
12109/*
12110 * call-seq:
12111 * inspect -> string
12112 *
12113 * Returns a string representation of +self+ (including the leading colon):
12114 *
12115 * :foo.inspect # => ":foo"
12116 *
12117 * Related: Symbol#to_s, Symbol#name.
12118 *
12119 */
12120
12121static VALUE
12122sym_inspect(VALUE sym)
12123{
12124 VALUE str = rb_sym2str(sym);
12125 const char *ptr;
12126 long len;
12127 char *dest;
12128
12129 if (!rb_str_symname_p(str)) {
12130 str = rb_str_inspect(str);
12131 len = RSTRING_LEN(str);
12132 rb_str_resize(str, len + 1);
12133 dest = RSTRING_PTR(str);
12134 memmove(dest + 1, dest, len);
12135 }
12136 else {
12137 rb_encoding *enc = STR_ENC_GET(str);
12138 VALUE orig_str = str;
12139
12140 len = RSTRING_LEN(orig_str);
12141 str = rb_enc_str_new(0, len + 1, enc);
12142
12143 // Get data pointer after allocation
12144 ptr = RSTRING_PTR(orig_str);
12145 dest = RSTRING_PTR(str);
12146 memcpy(dest + 1, ptr, len);
12147
12148 RB_GC_GUARD(orig_str);
12149 }
12150 dest[0] = ':';
12151
12153
12154 return str;
12155}
12156
12157VALUE
12159{
12160 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12161 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12162 return str;
12163}
12164
12165VALUE
12166rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12167{
12168 VALUE obj;
12169
12170 if (argc < 1) {
12171 rb_raise(rb_eArgError, "no receiver given");
12172 }
12173 obj = argv[0];
12174 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12175}
12176
12177/*
12178 * call-seq:
12179 * succ
12180 *
12181 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12182 *
12183 * :foo.succ # => :fop
12184 *
12185 * Related: String#succ.
12186 */
12187
12188static VALUE
12189sym_succ(VALUE sym)
12190{
12191 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12192}
12193
12194/*
12195 * call-seq:
12196 * symbol <=> object -> -1, 0, +1, or nil
12197 *
12198 * If +object+ is a symbol,
12199 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12200 *
12201 * :bar <=> :foo # => -1
12202 * :foo <=> :foo # => 0
12203 * :foo <=> :bar # => 1
12204 *
12205 * Otherwise, returns +nil+:
12206 *
12207 * :foo <=> 'bar' # => nil
12208 *
12209 * Related: String#<=>.
12210 */
12211
12212static VALUE
12213sym_cmp(VALUE sym, VALUE other)
12214{
12215 if (!SYMBOL_P(other)) {
12216 return Qnil;
12217 }
12218 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12219}
12220
12221/*
12222 * call-seq:
12223 * casecmp(object) -> -1, 0, 1, or nil
12224 *
12225 * :include: doc/symbol/casecmp.rdoc
12226 *
12227 */
12228
12229static VALUE
12230sym_casecmp(VALUE sym, VALUE other)
12231{
12232 if (!SYMBOL_P(other)) {
12233 return Qnil;
12234 }
12235 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12236}
12237
12238/*
12239 * call-seq:
12240 * casecmp?(object) -> true, false, or nil
12241 *
12242 * :include: doc/symbol/casecmp_p.rdoc
12243 *
12244 */
12245
12246static VALUE
12247sym_casecmp_p(VALUE sym, VALUE other)
12248{
12249 if (!SYMBOL_P(other)) {
12250 return Qnil;
12251 }
12252 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12253}
12254
12255/*
12256 * call-seq:
12257 * symbol =~ object -> integer or nil
12258 *
12259 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12260 * including possible updates to global variables;
12261 * see String#=~.
12262 *
12263 */
12264
12265static VALUE
12266sym_match(VALUE sym, VALUE other)
12267{
12268 return rb_str_match(rb_sym2str(sym), other);
12269}
12270
12271/*
12272 * call-seq:
12273 * match(pattern, offset = 0) -> matchdata or nil
12274 * match(pattern, offset = 0) {|matchdata| } -> object
12275 *
12276 * Equivalent to <tt>self.to_s.match</tt>,
12277 * including possible updates to global variables;
12278 * see String#match.
12279 *
12280 */
12281
12282static VALUE
12283sym_match_m(int argc, VALUE *argv, VALUE sym)
12284{
12285 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12286}
12287
12288/*
12289 * call-seq:
12290 * match?(pattern, offset) -> true or false
12291 *
12292 * Equivalent to <tt>sym.to_s.match?</tt>;
12293 * see String#match.
12294 *
12295 */
12296
12297static VALUE
12298sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12299{
12300 return rb_str_match_m_p(argc, argv, sym);
12301}
12302
12303/*
12304 * call-seq:
12305 * symbol[index] -> string or nil
12306 * symbol[start, length] -> string or nil
12307 * symbol[range] -> string or nil
12308 * symbol[regexp, capture = 0] -> string or nil
12309 * symbol[substring] -> string or nil
12310 *
12311 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12312 *
12313 */
12314
12315static VALUE
12316sym_aref(int argc, VALUE *argv, VALUE sym)
12317{
12318 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12319}
12320
12321/*
12322 * call-seq:
12323 * length -> integer
12324 *
12325 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12326 */
12327
12328static VALUE
12329sym_length(VALUE sym)
12330{
12331 return rb_str_length(rb_sym2str(sym));
12332}
12333
12334/*
12335 * call-seq:
12336 * empty? -> true or false
12337 *
12338 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12339 *
12340 */
12341
12342static VALUE
12343sym_empty(VALUE sym)
12344{
12345 return rb_str_empty(rb_sym2str(sym));
12346}
12347
12348/*
12349 * call-seq:
12350 * upcase(*options) -> symbol
12351 *
12352 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12353 *
12354 * See String#upcase.
12355 *
12356 */
12357
12358static VALUE
12359sym_upcase(int argc, VALUE *argv, VALUE sym)
12360{
12361 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12362}
12363
12364/*
12365 * call-seq:
12366 * downcase(*options) -> symbol
12367 *
12368 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12369 *
12370 * See String#downcase.
12371 *
12372 * Related: Symbol#upcase.
12373 *
12374 */
12375
12376static VALUE
12377sym_downcase(int argc, VALUE *argv, VALUE sym)
12378{
12379 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12380}
12381
12382/*
12383 * call-seq:
12384 * capitalize(*options) -> symbol
12385 *
12386 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12387 *
12388 * See String#capitalize.
12389 *
12390 */
12391
12392static VALUE
12393sym_capitalize(int argc, VALUE *argv, VALUE sym)
12394{
12395 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12396}
12397
12398/*
12399 * call-seq:
12400 * swapcase(*options) -> symbol
12401 *
12402 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12403 *
12404 * See String#swapcase.
12405 *
12406 */
12407
12408static VALUE
12409sym_swapcase(int argc, VALUE *argv, VALUE sym)
12410{
12411 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12412}
12413
12414/*
12415 * call-seq:
12416 * start_with?(*string_or_regexp) -> true or false
12417 *
12418 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12419 *
12420 */
12421
12422static VALUE
12423sym_start_with(int argc, VALUE *argv, VALUE sym)
12424{
12425 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12426}
12427
12428/*
12429 * call-seq:
12430 * end_with?(*strings) -> true or false
12431 *
12432 *
12433 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12434 *
12435 */
12436
12437static VALUE
12438sym_end_with(int argc, VALUE *argv, VALUE sym)
12439{
12440 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12441}
12442
12443/*
12444 * call-seq:
12445 * encoding -> encoding
12446 *
12447 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12448 *
12449 */
12450
12451static VALUE
12452sym_encoding(VALUE sym)
12453{
12454 return rb_obj_encoding(rb_sym2str(sym));
12455}
12456
12457static VALUE
12458string_for_symbol(VALUE name)
12459{
12460 if (!RB_TYPE_P(name, T_STRING)) {
12461 VALUE tmp = rb_check_string_type(name);
12462 if (NIL_P(tmp)) {
12463 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12464 name);
12465 }
12466 name = tmp;
12467 }
12468 return name;
12469}
12470
12471ID
12473{
12474 if (SYMBOL_P(name)) {
12475 return SYM2ID(name);
12476 }
12477 name = string_for_symbol(name);
12478 return rb_intern_str(name);
12479}
12480
12481VALUE
12483{
12484 if (SYMBOL_P(name)) {
12485 return name;
12486 }
12487 name = string_for_symbol(name);
12488 return rb_str_intern(name);
12489}
12490
12491/*
12492 * call-seq:
12493 * Symbol.all_symbols -> array_of_symbols
12494 *
12495 * Returns an array of all symbols currently in Ruby's symbol table:
12496 *
12497 * Symbol.all_symbols.size # => 9334
12498 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12499 *
12500 */
12501
12502static VALUE
12503sym_all_symbols(VALUE _)
12504{
12505 return rb_sym_all_symbols();
12506}
12507
12508VALUE
12509rb_str_to_interned_str(VALUE str)
12510{
12511 return rb_fstring(str);
12512}
12513
12514VALUE
12515rb_interned_str(const char *ptr, long len)
12516{
12517 struct RString fake_str;
12518 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12519}
12520
12521VALUE
12522rb_interned_str_cstr(const char *ptr)
12523{
12524 return rb_interned_str(ptr, strlen(ptr));
12525}
12526
12527VALUE
12528rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12529{
12530 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12531 rb_enc_autoload(enc);
12532 }
12533
12534 struct RString fake_str;
12535 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12536}
12537
12538VALUE
12539rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12540{
12541 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12542 rb_enc_autoload(enc);
12543 }
12544
12545 struct RString fake_str;
12546 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12547}
12548
12549VALUE
12550rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12551{
12552 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12553}
12554
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * When the inlined encoding index of +str+ equals rb_ascii8bit_encindex()
 * and the codepoint converts to a value in the range 0...0xff (0xff itself
 * excluded), the value is appended as a single byte with
 * rb_str_buf_cat_byte().  Every other case is handled by rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t byte = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(0 <= byte && byte < 0xff)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    /* General path. */
    rb_str_concat(str, codepoint);
}
#endif
12571
/*
 * Sets up the String and Symbol classes.
 *
 * In order, this function:
 *  - defines the String class under rb_cObject, walks rb_vm_fstring_table()
 *    with fstring_set_class_i (passing rb_cString), and installs
 *    empty_str_alloc as the allocator;
 *  - registers String's singleton and instance methods;
 *  - interns the option symbols :ascii, :turkic, :lithuanian and :fold;
 *  - defines the UnicodeNormalize module and caches the ids for
 *    "normalize" and "normalized?";
 *  - initializes rb_fs to nil, exposes it as the hooked variables $; and
 *    $-F (setter: rb_fs_setter) and registers its address with the GC;
 *  - defines the Symbol class under rb_cObject and registers its methods.
 *
 * The last argument of rb_define_method()/rb_define_singleton_method() is
 * the arity; the functions registered here with -1 are the ones declared
 * with an (argc, argv, self) parameter list.
 *
 * NOTE(review): the listing numbers embedded in this copy skip in several
 * places inside this function (for example 12577 -> 12579 and
 * 12584 -> 12587), so some registration lines appear to have been lost in
 * extraction -- compare against the upstream file before relying on this
 * list being complete.
 */
12572void
12573Init_String(void)
12574{
12575 rb_cString = rb_define_class("String", rb_cObject);
12576 RUBY_ASSERT(rb_vm_fstring_table());
12577 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12579 rb_define_alloc_func(rb_cString, empty_str_alloc);
12580 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12581 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12582 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12583 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12584 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12587 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12588 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12589 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12590 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12593 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12594 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12595 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12596 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12599 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12600 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12601 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12602 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12603 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12605 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12607 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12608 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12609 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12610 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12611 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12612 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12614 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12615 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12616 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12617 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12618 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12619 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12620 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12621 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12623 rb_define_method(rb_cString, "+@", str_uplus, 0);
12624 rb_define_method(rb_cString, "-@", str_uminus, 0);
12625 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12626 rb_define_alias(rb_cString, "dedup", "-@");
12627
12628 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12629 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12630 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12631 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12634 rb_define_method(rb_cString, "undump", str_undump, 0);
12635
    /* Option symbols, interned once here and kept in file-level variables. */
12636 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12637 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12638 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12639 sym_fold = ID2SYM(rb_intern_const("fold"));
12640
12641 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12642 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12643 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12644 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12645
12646 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12647 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12648 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12649 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12650
12651 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12652 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12653 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12654 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12655 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12656 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12657 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12658 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12659 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12660 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12661 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12662 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12664 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12665 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12666 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12667 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12668 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12669
12670 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12671 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12672 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12673
12674 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12675
12676 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12677 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12678 rb_define_method(rb_cString, "center", rb_str_center, -1);
12679
12680 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12681 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12682 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12683 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12684 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12685 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12686 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12687 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12688 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12689
12690 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12691 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12692 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12693 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12694 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12695 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12696 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12697 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12698 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12699
12700 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12701 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12702 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12703 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12704 rb_define_method(rb_cString, "count", rb_str_count, -1);
12705
12706 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12707 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12708 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12709 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12710
12711 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12712 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12713 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12714 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12715 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12716
12717 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12718
12719 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12720 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12721
12722 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12723 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12724
12725 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12726 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12727 rb_define_method(rb_cString, "b", rb_str_b, 0);
12728 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12729 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12730
12731 /* define UnicodeNormalize module here so that we don't have to look it up */
12732 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12733 id_normalize = rb_intern_const("normalize");
12734 id_normalized_p = rb_intern_const("normalized?");
12735
12736 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12737 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12738 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12739
    /* rb_fs backs both $; and $-F; writes go through rb_fs_setter, and the
     * variable's address is registered with the GC. */
12740 rb_fs = Qnil;
12741 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12742 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12743 rb_gc_register_address(&rb_fs);
12744
    /* Symbol class and its methods. */
12745 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12749 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12750
12751 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12752 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12753 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12754 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12755 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12756 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12757
12758 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12759 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12760 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12761 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12762
12763 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12764 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12765 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12766 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12767 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12768 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12769 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12770
12771 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12772 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12773 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12774 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12775
12776 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12777 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12778
12779 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12780}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2635
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:937
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3877
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize method.
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3508
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding relates APIs.
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition encoding.c:1475
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT a.k.a.
Definition encoding.c:1463
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition encoding.c:1537
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition encoding.c:1676
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition encoding.c:1481
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition encoding.c:1469
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition encoding.c:1589
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition encoding.c:1523
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition encoding.c:1487
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition encoding.c:1493
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1290
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:905
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1155
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2931
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1174
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12528
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2254
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3616
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1103
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1395
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1296
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:924
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12550
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:789
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:415
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition array.c:741
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition hash.c:1477
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:674
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1835
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1043
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1841
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1892
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1905
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1682
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1460
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2405
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3681
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1371
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12158
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2477
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1347
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1676
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2959
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5273
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4050
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3056
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11457
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1770
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1718
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1137
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:959
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1466
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1921
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4036
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3449
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2343
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1939
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6481
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3064
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12522
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1377
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3647
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3006
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4152
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3273
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7202
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2697
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12515
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4106
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3923
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4081
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3623
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3181
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5783
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11515
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1632
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2855
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3153
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3256
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1149
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2653
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7316
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1359
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1648
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2357
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5701
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9412
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1143
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:879
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1939
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1956
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2960
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1291
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:971
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12482
ID rb_to_id(VALUE str)
Definition string.c:12472
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1389
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2832
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2716
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1383
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2727
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1709
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:416
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:197
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:8274
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:300
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113