Ruby 3.4.2p28 (2025-02-15 revision d2930f8e7a5db8a7337fa43370940381b420cc3e)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
/* Shorthand for the begin/end offset of capture group `no`. Both expand to
 * an access through a variable named `regs`, which must be in scope at the
 * point of use. */
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* Flags of RString
82 *
83 * 0: STR_SHARED (equal to ELTS_SHARED)
84 * The string is shared. The buffer this string points to is owned by
85 * another string (the shared root).
86 * 1: RSTRING_NOEMBED
87 * The string is not embedded. When a string is embedded, the contents
88 * follow the header. When a string is not embedded, the contents are
89 * in a separately allocated buffer.
90 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
91 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
92 * It emits a deprecation warning when mutated for the first time.
93 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
94 * The string was allocated by the `Symbol#to_s` method.
95 * It emits a deprecation warning when mutated for the first time.
96 * 4: STR_PRECOMPUTED_HASH
97 * The string is embedded and has its precomputed hashcode stored
98 * after the terminator.
99 * 5: STR_SHARED_ROOT
100 * Other strings may point to the contents of this string. When this
101 * flag is set, STR_SHARED must not be set.
102 * 6: STR_BORROWED
103 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
104 * to be unshared by rb_str_tmp_frozen_release.
105 * 7: STR_TMPLOCK
106 * The pointer to the buffer is passed to a system call such as
107 * read(2). Any modification and realloc is prohibited.
108 * 8-9: ENC_CODERANGE
109 * Stores the coderange of the string.
110 * 10-16: ENCODING
111 * Stores the encoding of the string.
112 * 17: RSTRING_FSTR
113 * The string is a fstring. The string is deduplicated in the fstring
114 * table.
115 * 18: STR_NOFREE
116 * Do not free this string's buffer when the string is reclaimed
117 * by the garbage collector. Used for when the string buffer is a C
118 * string literal.
119 * 19: STR_FAKESTR
120 * The string is not allocated or managed by the garbage collector.
121 * Typically, the string object header (struct RString) is temporarily
122 * allocated on C stack.
123 */
124
/* File-local constant and RString flag bits. The meaning of each STR_* bit
 * is described in the "Flags of RString" table above. */
125#define RUBY_MAX_CHAR_LEN 16
126#define STR_PRECOMPUTED_HASH FL_USER4
127#define STR_SHARED_ROOT FL_USER5
128#define STR_BORROWED FL_USER6
129#define STR_TMPLOCK FL_USER7
130#define STR_NOFREE FL_USER18
131#define STR_FAKESTR FL_USER19
132
/* Flag `str` as not embedded and clear the sharing-related bits
 * STR_SHARED, STR_SHARED_ROOT and STR_BORROWED. */
133#define STR_SET_NOEMBED(str) do {\
134 FL_SET((str), STR_NOEMBED);\
135 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
136} while (0)
/* Flag `str` as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
137#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
138
/* Store `n` into the length field of `str`. Only the field is written:
 * no terminator is filled and no flags are touched. */
139#define STR_SET_LEN(str, n) do { \
140 RSTRING(str)->len = (n); \
141} while (0)
142
143static inline bool
144str_encindex_fastpath(int encindex)
145{
146 // The overwhelming majority of strings are in one of these 3 encodings.
147 switch (encindex) {
148 case ENCINDEX_ASCII_8BIT:
149 case ENCINDEX_UTF_8:
150 case ENCINDEX_US_ASCII:
151 return true;
152 default:
153 return false;
154 }
155}
156
157static inline bool
158str_enc_fastpath(VALUE str)
159{
160 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
161}
162
/* Terminator length for `str`: 1 for fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
163#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write `termlen` zero bytes at `ptr`. The first byte is always stored
 * directly; memset is only used when the terminator is wider than one byte.
 * Both arguments are evaluated exactly once. */
164#define TERM_FILL(ptr, termlen) do {\
165 char *const term_fill_ptr = (ptr);\
166 const int term_fill_len = (termlen);\
167 *term_fill_ptr = '\0';\
168 if (UNLIKELY(term_fill_len > 1))\
169 memset(term_fill_ptr, 0, term_fill_len);\
170} while (0)
171
/* Resize the buffer of `str` to `capacity` bytes, using the terminator
 * length of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise the contents are copied to a freshly allocated heap
 *   buffer and the string is switched to the non-embedded representation.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 *
 * `capacity` and `termlen` are parenthesized wherever they take part in an
 * expression so that callers may pass arbitrary expressions. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
195
/* Make `str` share the buffer owned by `shared_str`. Does nothing for fake
 * strings (STR_FAKESTR). Otherwise it asserts that the pointer of `str`
 * lies within the buffer of `shared_str`, records `shared_str` in
 * aux.shared through the write barrier, sets STR_SHARED on `str` and
 * STR_SHARED_ROOT on `shared_str`, and additionally sets STR_BORROWED on
 * the root when the root has no class. */
196#define STR_SET_SHARED(str, shared_str) do { \
197 if (!FL_TEST(str, STR_FAKESTR)) { \
198 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
199 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
200 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
201 FL_SET((str), STR_SHARED); \
202 FL_SET((shared_str), STR_SHARED_ROOT); \
203 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
204 FL_SET_RAW((shared_str), STR_BORROWED); \
205 } \
206} while (0)
207
/* Raw heap pointer of `str`, and the byte size of its heap buffer: the
 * recorded capacity plus the terminator length. */
208#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
209#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
210/* TODO: include the terminator size in capa. */
211
/* Encoding object of `str`; thin alias for get_encoding(). */
212#define STR_ENC_GET(str) get_encoding(str)
213
/* SHARABLE_SUBSTRING_P(beg, len, end): unless SHARABLE_MIDDLE_SUBSTRING is
 * enabled, true only when the range [beg, beg+len) ends exactly at `end`;
 * with it enabled, always true. */
214#if !defined SHARABLE_MIDDLE_SUBSTRING
215# define SHARABLE_MIDDLE_SUBSTRING 0
216#endif
217#if !SHARABLE_MIDDLE_SUBSTRING
218#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
219#else
220#define SHARABLE_SUBSTRING_P(beg, len, end) 1
221#endif
222
223
224static inline long
225str_embed_capa(VALUE str)
226{
227 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
228}
229
230bool
231rb_str_reembeddable_p(VALUE str)
232{
233 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
234}
235
236static inline size_t
237rb_str_embed_size(long capa)
238{
239 return offsetof(struct RString, as.embed.ary) + capa;
240}
241
242size_t
243rb_str_size_as_embedded(VALUE str)
244{
245 size_t real_size;
246 if (STR_EMBED_P(str)) {
247 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
248 }
249 /* if the string is not currently embedded, but it can be embedded, how
250 * much space would it require */
251 else if (rb_str_reembeddable_p(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
253 }
254 else {
255 real_size = sizeof(struct RString);
256 }
257
258 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
259 real_size += sizeof(st_index_t);
260 }
261
262 return real_size;
263}
264
/* Return true when the GC can allocate a slot large enough to embed `len`
 * content bytes plus `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
270
271static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
272static VALUE str_new_frozen(VALUE klass, VALUE orig);
273static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
274static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
275static VALUE str_new(VALUE klass, const char *ptr, long len);
276static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
277static inline void str_modifiable(VALUE str);
278static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
279static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
280
281static inline void
282str_make_independent(VALUE str)
283{
284 long len = RSTRING_LEN(str);
285 int termlen = TERM_LEN(str);
286 str_make_independent_expand((str), len, 0L, termlen);
287}
288
289static inline int str_dependent_p(VALUE str);
290
291void
292rb_str_make_independent(VALUE str)
293{
294 if (str_dependent_p(str)) {
295 str_make_independent(str);
296 }
297}
298
299void
300rb_str_make_embedded(VALUE str)
301{
302 RUBY_ASSERT(rb_str_reembeddable_p(str));
303 RUBY_ASSERT(!STR_EMBED_P(str));
304
305 char *buf = RSTRING(str)->as.heap.ptr;
306 long len = RSTRING(str)->len;
307
308 STR_SET_EMBED(str);
309 STR_SET_LEN(str, len);
310
311 if (len > 0) {
312 memcpy(RSTRING_PTR(str), buf, len);
313 ruby_xfree(buf);
314 }
315
316 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
317}
318
/* Debugging aid: print a warning to stderr saying that the function named
 * `func` is about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char message[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, message, func);
}
328
329/* symbols for [up|down|swap]case/capitalize options */
330static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
331
332static rb_encoding *
333get_encoding(VALUE str)
334{
335 return rb_enc_from_index(ENCODING_GET(str));
336}
337
338static void
339mustnot_broken(VALUE str)
340{
341 if (is_broken_string(str)) {
342 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
343 }
344}
345
346static void
347mustnot_wchar(VALUE str)
348{
349 rb_encoding *enc = STR_ENC_GET(str);
350 if (rb_enc_mbminlen(enc) > 1) {
351 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
352 }
353}
354
355static int fstring_cmp(VALUE a, VALUE b);
356
357static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
358
/* When long and void* have the same size, register_fstring() stores the
 * hash of a fake string in its capa field and fstring_hash() reads it back
 * from there instead of hashing again. */
359#if SIZEOF_LONG == SIZEOF_VOIDP
360#define PRECOMPUTED_FAKESTR_HASH 1
361#else
362#endif
363
#ifdef PRECOMPUTED_FAKESTR_HASH
/* Hash function of the fstring table. For a fake string the hash was
 * already computed by register_fstring and parked in the capa field;
 * every other string is hashed with rb_str_hash(). */
static st_index_t
fstring_hash(VALUE str)
{
    if (!FL_TEST_RAW(str, STR_FAKESTR)) {
        return rb_str_hash(str);
    }
    return (st_index_t)RSTRING(str)->as.heap.aux.capa;
}
#else
#define fstring_hash rb_str_hash
#endif
379
380const struct st_hash_type rb_fstring_hash_type = {
381 fstring_cmp,
382 fstring_hash,
383};
384
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
385#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
386
387static inline st_index_t
388str_do_hash(VALUE str)
389{
390 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
391 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
392 if (e && !is_ascii_string(str)) {
393 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
394 }
395 return h;
396}
397
398static VALUE
399str_store_precomputed_hash(VALUE str, st_index_t hash)
400{
401 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
402 RUBY_ASSERT(STR_EMBED_P(str));
403
404#if RUBY_DEBUG
405 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406 size_t free_bytes = str_embed_capa(str) - used_bytes;
407 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
408#endif
409
410 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
411
412 FL_SET(str, STR_PRECOMPUTED_HASH);
413
414 return str;
415}
416
418 VALUE fstr;
419 bool copy;
420 bool force_precompute_hash;
421};
422
423static int
424fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
425{
426 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
427 VALUE str = (VALUE)*key;
428
429 if (existing) {
430 /* because of lazy sweep, str may be unmarked already and swept
431 * at next time */
432
433 if (rb_objspace_garbage_object_p(str)) {
434 arg->fstr = Qundef;
435 return ST_DELETE;
436 }
437
438 arg->fstr = str;
439 return ST_STOP;
440 }
441 else {
442 // Unless the string is empty or binary, its coderange has been precomputed.
443 int coderange = ENC_CODERANGE(str);
444
445 if (FL_TEST_RAW(str, STR_FAKESTR)) {
446 if (arg->copy) {
447 VALUE new_str;
448 long len = RSTRING_LEN(str);
449 long capa = len + sizeof(st_index_t);
450 int term_len = TERM_LEN(str);
451
452 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
453 new_str = str_alloc_embed(rb_cString, capa + term_len);
454 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
455 STR_SET_LEN(new_str, RSTRING_LEN(str));
456 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
457 rb_enc_copy(new_str, str);
458 str_store_precomputed_hash(new_str, fstring_hash(str));
459 }
460 else {
461 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
462 rb_enc_copy(new_str, str);
463#ifdef PRECOMPUTED_FAKESTR_HASH
464 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
465 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
466 }
467#endif
468 }
469 str = new_str;
470 }
471 else {
472 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
473 RSTRING(str)->len,
474 ENCODING_GET(str));
475 }
476 OBJ_FREEZE(str);
477 }
478 else {
479 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
480 str = str_new_frozen(rb_cString, str);
481 }
482 if (STR_SHARED_P(str)) { /* str should not be shared */
483 /* shared substring */
484 str_make_independent(str);
486 }
487 if (!BARE_STRING_P(str)) {
488 str = str_new_frozen(rb_cString, str);
489 }
490 }
491
492 ENC_CODERANGE_SET(str, coderange);
493 RBASIC(str)->flags |= RSTRING_FSTR;
494
495 *key = *value = arg->fstr = str;
496 return ST_CONTINUE;
497 }
498}
499
500VALUE
501rb_fstring(VALUE str)
502{
503 VALUE fstr;
504 int bare;
505
506 Check_Type(str, T_STRING);
507
508 if (FL_TEST(str, RSTRING_FSTR))
509 return str;
510
511 bare = BARE_STRING_P(str);
512 if (!bare) {
513 if (STR_EMBED_P(str)) {
514 OBJ_FREEZE(str);
515 return str;
516 }
517
518 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
520 return str;
521 }
522 }
523
524 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
525 rb_str_resize(str, RSTRING_LEN(str));
526
527 fstr = register_fstring(str, false, false);
528
529 if (!bare) {
530 str_replace_shared_without_enc(str, fstr);
531 OBJ_FREEZE(str);
532 return str;
533 }
534 return fstr;
535}
536
537static VALUE
538register_fstring(VALUE str, bool copy, bool force_precompute_hash)
539{
540 struct fstr_update_arg args = {
541 .copy = copy,
542 .force_precompute_hash = force_precompute_hash
543 };
544
545#if SIZEOF_VOIDP == SIZEOF_LONG
546 if (FL_TEST_RAW(str, STR_FAKESTR)) {
547 // if the string hasn't been interned, we'll need the hash twice, so we
548 // compute it once and store it in capa
549 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
550 }
551#endif
552
553 RB_VM_LOCK_ENTER();
554 {
555 st_table *frozen_strings = rb_vm_fstring_table();
556 do {
557 args.fstr = str;
558 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
559 } while (UNDEF_P(args.fstr));
560 }
561 RB_VM_LOCK_LEAVE();
562
563 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
564 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
565 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
566 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
567
568 return args.fstr;
569}
570
571static VALUE
572setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
573{
574 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
575
576 if (!name) {
578 name = "";
579 }
580
581 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
582
583 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
584 fake_str->len = len;
585 fake_str->as.heap.ptr = (char *)name;
586 fake_str->as.heap.aux.capa = len;
587 return (VALUE)fake_str;
588}
589
590/*
591 * set up a fake string which refers a static string literal.
592 */
593VALUE
594rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
595{
596 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
597}
598
599/*
600 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
601 * shared string which refers a static string literal. `ptr` must
602 * point a constant string.
603 */
604VALUE
605rb_fstring_new(const char *ptr, long len)
606{
607 struct RString fake_str;
608 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
609}
610
611VALUE
612rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
613{
614 struct RString fake_str;
615 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
616}
617
618VALUE
619rb_fstring_cstr(const char *ptr)
620{
621 return rb_fstring_new(ptr, strlen(ptr));
622}
623
624static int
625fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
626{
627 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
628 return ST_CONTINUE;
629}
630
631static int
632fstring_cmp(VALUE a, VALUE b)
633{
634 long alen, blen;
635 const char *aptr, *bptr;
636 RSTRING_GETMEM(a, aptr, alen);
637 RSTRING_GETMEM(b, bptr, blen);
638 return (alen != blen ||
639 ENCODING_GET(a) != ENCODING_GET(b) ||
640 memcmp(aptr, bptr, alen) != 0);
641}
642
643static inline bool
644single_byte_optimizable(VALUE str)
645{
646 int encindex = ENCODING_GET(str);
647 switch (encindex) {
648 case ENCINDEX_ASCII_8BIT:
649 case ENCINDEX_US_ASCII:
650 return true;
651 case ENCINDEX_UTF_8:
652 // For UTF-8 it's worth scanning the string coderange when unknown.
654 }
655 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
656 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
657 return true;
658 }
659
660 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
661 return true;
662 }
663
664 /* Conservative. Possibly single byte.
665 * "\xa1" in Shift_JIS for example. */
666 return false;
667}
668
670
/* Return a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * The scan has up to three phases: a byte-wise head that advances `p` to a
 * word boundary (only when unaligned word access is not allowed), a
 * word-at-a-time loop that tests a whole uintptr_t against NONASCII_MASK,
 * and a byte-wise tail for the remaining bytes. Both byte-wise phases are
 * switch statements that rely on case fall-through. */
671static inline const char *
672search_nonascii(const char *p, const char *e)
673{
674 const uintptr_t *s, *t;
675
/* NONASCII_MASK: a word with 0x80 in every byte, sized to uintptr_t. */
676#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
677# if SIZEOF_UINTPTR_T == 8
678# define NONASCII_MASK UINT64_C(0x8080808080808080)
679# elif SIZEOF_UINTPTR_T == 4
680# define NONASCII_MASK UINT32_C(0x80808080)
681# else
682# error "don't know what to do."
683# endif
684#else
685# if SIZEOF_UINTPTR_T == 8
686# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
687# elif SIZEOF_UINTPTR_T == 4
688# define NONASCII_MASK 0x80808080UL /* or...? */
689# else
690# error "don't know what to do."
691# endif
692#endif
693
/* Word scan is attempted when unaligned access is allowed or when at
 * least one pointer-sized chunk is available. */
694 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
695#if !UNALIGNED_WORD_ACCESS
696 if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes up to the next word boundary; p is advanced first
 * and the skipped bytes are checked through negative indices, oldest
 * first, by falling through the cases. */
697 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
698 p += l;
699 switch (l) {
700 default: UNREACHABLE;
701#if SIZEOF_VOIDP > 4
702 case 7: if (p[-7]&0x80) return p-7;
703 case 6: if (p[-6]&0x80) return p-6;
704 case 5: if (p[-5]&0x80) return p-5;
705 case 4: if (p[-4]&0x80) return p-4;
706#endif
707 case 3: if (p[-3]&0x80) return p-3;
708 case 2: if (p[-2]&0x80) return p-2;
709 case 1: if (p[-1]&0x80) return p-1;
710 case 0: break;
711 }
712 }
713#endif
714#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
715#define aligned_ptr(value) \
716 __builtin_assume_aligned((value), sizeof(uintptr_t))
717#else
718#define aligned_ptr(value) (uintptr_t *)(value)
719#endif
720 s = aligned_ptr(p);
/* t bounds the word loop: a word is read at s only while s < t, where t is
 * e minus (word size - 1). */
721 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
722#undef aligned_ptr
723 for (;s < t; s++) {
724 if (*s & NONASCII_MASK) {
/* Locate the matching byte inside the word: count leading zero bits on
 * big-endian or trailing zero bits otherwise, then divide by 8. */
725#ifdef WORDS_BIGENDIAN
726 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
727#else
728 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
729#endif
730 }
731 }
732 p = (const char *)s;
733 }
734
/* Tail: the remaining e - p bytes, checked in address order by fall-through. */
735 switch (e - p) {
736 default: UNREACHABLE;
737#if SIZEOF_VOIDP > 4
738 case 7: if (e[-7]&0x80) return e-7;
739 case 6: if (e[-6]&0x80) return e-6;
740 case 5: if (e[-5]&0x80) return e-5;
741 case 4: if (e[-4]&0x80) return e-4;
742#endif
743 case 3: if (e[-3]&0x80) return e-3;
744 case 2: if (e[-2]&0x80) return e-2;
745 case 1: if (e[-1]&0x80) return e-1;
746 case 0: return NULL;
747 }
748}
749
750static int
751coderange_scan(const char *p, long len, rb_encoding *enc)
752{
753 const char *e = p + len;
754
755 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
756 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
757 p = search_nonascii(p, e);
759 }
760
761 if (rb_enc_asciicompat(enc)) {
762 p = search_nonascii(p, e);
763 if (!p) return ENC_CODERANGE_7BIT;
764 for (;;) {
765 int ret = rb_enc_precise_mbclen(p, e, enc);
767 p += MBCLEN_CHARFOUND_LEN(ret);
768 if (p == e) break;
769 p = search_nonascii(p, e);
770 if (!p) break;
771 }
772 }
773 else {
774 while (p < e) {
775 int ret = rb_enc_precise_mbclen(p, e, enc);
777 p += MBCLEN_CHARFOUND_LEN(ret);
778 }
779 }
780 return ENC_CODERANGE_VALID;
781}
782
783long
784rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
785{
786 const char *p = s;
787
788 if (*cr == ENC_CODERANGE_BROKEN)
789 return e - s;
790
791 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
792 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
793 if (*cr == ENC_CODERANGE_VALID) return e - s;
794 p = search_nonascii(p, e);
796 return e - s;
797 }
798 else if (rb_enc_asciicompat(enc)) {
799 p = search_nonascii(p, e);
800 if (!p) {
801 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
802 return e - s;
803 }
804 for (;;) {
805 int ret = rb_enc_precise_mbclen(p, e, enc);
806 if (!MBCLEN_CHARFOUND_P(ret)) {
808 return p - s;
809 }
810 p += MBCLEN_CHARFOUND_LEN(ret);
811 if (p == e) break;
812 p = search_nonascii(p, e);
813 if (!p) break;
814 }
815 }
816 else {
817 while (p < e) {
818 int ret = rb_enc_precise_mbclen(p, e, enc);
819 if (!MBCLEN_CHARFOUND_P(ret)) {
821 return p - s;
822 }
823 p += MBCLEN_CHARFOUND_LEN(ret);
824 }
825 }
827 return e - s;
828}
829
830static inline void
831str_enc_copy(VALUE str1, VALUE str2)
832{
833 rb_enc_set_index(str1, ENCODING_GET(str2));
834}
835
836/* Like str_enc_copy, but does not check frozen status of str1.
837 * You should use this only if you're certain that str1 is not frozen. */
838static inline void
839str_enc_copy_direct(VALUE str1, VALUE str2)
840{
841 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
842 if (inlined_encoding == ENCODING_INLINE_MAX) {
843 rb_enc_set_index(str1, rb_enc_get_index(str2));
844 }
845 else {
846 ENCODING_SET_INLINED(str1, inlined_encoding);
847 }
848}
849
850static void
851rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
852{
853 /* this function is designed for copying encoding and coderange
854 * from src to new string "dest" which is made from the part of src.
855 */
856 str_enc_copy(dest, src);
857 if (RSTRING_LEN(dest) == 0) {
858 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
860 else
862 return;
863 }
864 switch (ENC_CODERANGE(src)) {
867 break;
869 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
870 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
872 else
874 break;
875 default:
876 break;
877 }
878}
879
880static void
881rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
882{
883 str_enc_copy(dest, src);
885}
886
887static int
888enc_coderange_scan(VALUE str, rb_encoding *enc)
889{
890 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
891}
892
893int
894rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
895{
896 return enc_coderange_scan(str, enc);
897}
898
899int
901{
902 int cr = ENC_CODERANGE(str);
903
904 if (cr == ENC_CODERANGE_UNKNOWN) {
905 cr = enc_coderange_scan(str, get_encoding(str));
906 ENC_CODERANGE_SET(str, cr);
907 }
908 return cr;
909}
910
911static inline bool
912rb_enc_str_asciicompat(VALUE str)
913{
914 int encindex = ENCODING_GET_INLINED(str);
915 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
916}
917
918int
920{
921 switch(ENC_CODERANGE(str)) {
923 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
925 return true;
926 default:
927 return false;
928 }
929}
930
931static inline void
932str_mod_check(VALUE s, const char *p, long len)
933{
934 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
935 rb_raise(rb_eRuntimeError, "string modified");
936 }
937}
938
939static size_t
940str_capacity(VALUE str, const int termlen)
941{
942 if (STR_EMBED_P(str)) {
943 return str_embed_capa(str) - termlen;
944 }
945 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
946 return RSTRING(str)->len;
947 }
948 else {
949 return RSTRING(str)->as.heap.aux.capa;
950 }
951}
952
953size_t
955{
956 return str_capacity(str, TERM_LEN(str));
957}
958
959static inline void
960must_not_null(const char *ptr)
961{
962 if (!ptr) {
963 rb_raise(rb_eArgError, "NULL pointer given");
964 }
965}
966
967static inline VALUE
968str_alloc_embed(VALUE klass, size_t capa)
969{
970 size_t size = rb_str_embed_size(capa);
971 RUBY_ASSERT(size > 0);
972 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
973
974 NEWOBJ_OF(str, struct RString, klass,
976
977 return (VALUE)str;
978}
979
980static inline VALUE
981str_alloc_heap(VALUE klass)
982{
983 NEWOBJ_OF(str, struct RString, klass,
984 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
985
986 return (VALUE)str;
987}
988
989static inline VALUE
990empty_str_alloc(VALUE klass)
991{
992 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
993 VALUE str = str_alloc_embed(klass, 0);
994 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
996 return str;
997}
998
999static VALUE
1000str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1001{
1002 VALUE str;
1003
1004 if (len < 0) {
1005 rb_raise(rb_eArgError, "negative string size (or size too big)");
1006 }
1007
1008 if (enc == NULL) {
1009 enc = rb_ascii8bit_encoding();
1010 }
1011
1012 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1013
1014 int termlen = rb_enc_mbminlen(enc);
1015
1016 if (STR_EMBEDDABLE_P(len, termlen)) {
1017 str = str_alloc_embed(klass, len + termlen);
1018 if (len == 0) {
1019 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1020 }
1021 }
1022 else {
1023 str = str_alloc_heap(klass);
1024 RSTRING(str)->as.heap.aux.capa = len;
1025 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1026 * integer overflow. If we can STATIC_ASSERT that, the following
1027 * mul_add_mul can be reverted to a simple ALLOC_N. */
1028 RSTRING(str)->as.heap.ptr =
1029 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1030 }
1031
1032 rb_enc_raw_set(str, enc);
1033
1034 if (ptr) {
1035 memcpy(RSTRING_PTR(str), ptr, len);
1036 }
1037
1038 STR_SET_LEN(str, len);
1039 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1040 return str;
1041}
1042
1043static VALUE
1044str_new(VALUE klass, const char *ptr, long len)
1045{
1046 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1047}
1048
1049VALUE
1050rb_str_new(const char *ptr, long len)
1051{
1052 return str_new(rb_cString, ptr, len);
1053}
1054
1055VALUE
1056rb_usascii_str_new(const char *ptr, long len)
1057{
1058 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1059}
1060
1061VALUE
1062rb_utf8_str_new(const char *ptr, long len)
1063{
1064 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1065}
1066
1067VALUE
1068rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1069{
1070 return str_enc_new(rb_cString, ptr, len, enc);
1071}
1072
1073VALUE
1074rb_str_new_cstr(const char *ptr)
1075{
1076 must_not_null(ptr);
1077 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1078 * memory regions, and that cannot be detected by the MSAN. Just
1079 * trust the programmer that the argument passed here is a sane C
1080 * string. */
1081 __msan_unpoison_string(ptr);
1082 return rb_str_new(ptr, strlen(ptr));
1083}
1084
1085VALUE
1087{
1088 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1089}
1090
1091VALUE
1092rb_utf8_str_new_cstr(const char *ptr)
1093{
1094 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1095}
1096
1097VALUE
1098rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1099{
1100 must_not_null(ptr);
1101 if (rb_enc_mbminlen(enc) != 1) {
1102 rb_raise(rb_eArgError, "wchar encoding given");
1103 }
1104 return rb_enc_str_new(ptr, strlen(ptr), enc);
1105}
1106
1107static VALUE
1108str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1109{
1110 VALUE str;
1111
1112 if (len < 0) {
1113 rb_raise(rb_eArgError, "negative string size (or size too big)");
1114 }
1115
1116 if (!ptr) {
1117 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1118 }
1119 else {
1120 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1121 str = str_alloc_heap(klass);
1122 RSTRING(str)->len = len;
1123 RSTRING(str)->as.heap.ptr = (char *)ptr;
1124 RSTRING(str)->as.heap.aux.capa = len;
1125 RBASIC(str)->flags |= STR_NOFREE;
1126 rb_enc_associate_index(str, encindex);
1127 }
1128 return str;
1129}
1130
1131VALUE
1132rb_str_new_static(const char *ptr, long len)
1133{
1134 return str_new_static(rb_cString, ptr, len, 0);
1135}
1136
1137VALUE
1138rb_usascii_str_new_static(const char *ptr, long len)
1139{
1140 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1141}
1142
1143VALUE
1144rb_utf8_str_new_static(const char *ptr, long len)
1145{
1146 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1147}
1148
1149VALUE
1150rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1151{
1152 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1153}
1154
1155static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1156 rb_encoding *from, rb_encoding *to,
1157 int ecflags, VALUE ecopts);
1158
1159static inline bool
1160is_enc_ascii_string(VALUE str, rb_encoding *enc)
1161{
1162 int encidx = rb_enc_to_index(enc);
1163 if (rb_enc_get_index(str) == encidx)
1164 return is_ascii_string(str);
1165 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1166}
1167
1168VALUE
1169rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1170{
1171 long len;
1172 const char *ptr;
1173 VALUE newstr;
1174
1175 if (!to) return str;
1176 if (!from) from = rb_enc_get(str);
1177 if (from == to) return str;
1178 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1179 rb_is_ascii8bit_enc(to)) {
1180 if (STR_ENC_GET(str) != to) {
1181 str = rb_str_dup(str);
1182 rb_enc_associate(str, to);
1183 }
1184 return str;
1185 }
1186
1187 RSTRING_GETMEM(str, ptr, len);
1188 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1189 from, to, ecflags, ecopts);
1190 if (NIL_P(newstr)) {
1191 /* some error, return original */
1192 return str;
1193 }
1194 return newstr;
1195}
1196
1197VALUE
1198rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1199 rb_encoding *from, int ecflags, VALUE ecopts)
1200{
1201 long olen;
1202
1203 olen = RSTRING_LEN(newstr);
1204 if (ofs < -olen || olen < ofs)
1205 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1206 if (ofs < 0) ofs += olen;
1207 if (!from) {
1208 STR_SET_LEN(newstr, ofs);
1209 return rb_str_cat(newstr, ptr, len);
1210 }
1211
1212 rb_str_modify(newstr);
1213 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1214 rb_enc_get(newstr),
1215 ecflags, ecopts);
1216}
1217
1218VALUE
1219rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1220{
1221 STR_SET_LEN(str, 0);
1222 rb_enc_associate(str, enc);
1223 rb_str_cat(str, ptr, len);
1224 return str;
1225}
1226
1227static VALUE
1228str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1229 rb_encoding *from, rb_encoding *to,
1230 int ecflags, VALUE ecopts)
1231{
1232 rb_econv_t *ec;
1234 long olen;
1235 VALUE econv_wrapper;
1236 const unsigned char *start, *sp;
1237 unsigned char *dest, *dp;
1238 size_t converted_output = (size_t)ofs;
1239
1240 olen = rb_str_capacity(newstr);
1241
1242 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1243 RBASIC_CLEAR_CLASS(econv_wrapper);
1244 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1245 if (!ec) return Qnil;
1246 DATA_PTR(econv_wrapper) = ec;
1247
1248 sp = (unsigned char*)ptr;
1249 start = sp;
1250 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1251 (dp = dest + converted_output),
1252 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1254 /* destination buffer short */
1255 size_t converted_input = sp - start;
1256 size_t rest = len - converted_input;
1257 converted_output = dp - dest;
1258 rb_str_set_len(newstr, converted_output);
1259 if (converted_input && converted_output &&
1260 rest < (LONG_MAX / converted_output)) {
1261 rest = (rest * converted_output) / converted_input;
1262 }
1263 else {
1264 rest = olen;
1265 }
1266 olen += rest < 2 ? 2 : rest;
1267 rb_str_resize(newstr, olen);
1268 }
1269 DATA_PTR(econv_wrapper) = 0;
1270 RB_GC_GUARD(econv_wrapper);
1271 rb_econv_close(ec);
1272 switch (ret) {
1273 case econv_finished:
1274 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1275 rb_str_set_len(newstr, len);
1276 rb_enc_associate(newstr, to);
1277 return newstr;
1278
1279 default:
1280 return Qnil;
1281 }
1282}
1283
1284VALUE
1285rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1286{
1287 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1288}
1289
1290VALUE
1291rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1292{
1293 rb_encoding *ienc;
1294 VALUE str;
1295 const int eidx = rb_enc_to_index(eenc);
1296
1297 if (!ptr) {
1298 return rb_enc_str_new(ptr, len, eenc);
1299 }
1300
1301 /* ASCII-8BIT case, no conversion */
1302 if ((eidx == rb_ascii8bit_encindex()) ||
1303 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1304 return rb_str_new(ptr, len);
1305 }
1306 /* no default_internal or same encoding, no conversion */
1307 ienc = rb_default_internal_encoding();
1308 if (!ienc || eenc == ienc) {
1309 return rb_enc_str_new(ptr, len, eenc);
1310 }
1311 /* ASCII compatible, and ASCII only string, no conversion in
1312 * default_internal */
1313 if ((eidx == rb_ascii8bit_encindex()) ||
1314 (eidx == rb_usascii_encindex()) ||
1315 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1316 return rb_enc_str_new(ptr, len, ienc);
1317 }
1318 /* convert from the given encoding to default_internal */
1319 str = rb_enc_str_new(NULL, 0, ienc);
1320 /* when the conversion failed for some reason, just ignore the
1321 * default_internal and result in the given encoding as-is. */
1322 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1323 rb_str_initialize(str, ptr, len, eenc);
1324 }
1325 return str;
1326}
1327
1328VALUE
1329rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1330{
1331 int eidx = rb_enc_to_index(eenc);
1332 if (eidx == rb_usascii_encindex() &&
1333 !is_ascii_string(str)) {
1334 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1335 return str;
1336 }
1337 rb_enc_associate_index(str, eidx);
1338 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1339}
1340
1341VALUE
1342rb_external_str_new(const char *ptr, long len)
1343{
1344 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1345}
1346
1347VALUE
1349{
1350 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1351}
1352
1353VALUE
1354rb_locale_str_new(const char *ptr, long len)
1355{
1356 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1357}
1358
1359VALUE
1361{
1362 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1363}
1364
1365VALUE
1366rb_filesystem_str_new(const char *ptr, long len)
1367{
1368 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1369}
1370
1371VALUE
1373{
1374 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1375}
1376
1377VALUE
1379{
1380 return rb_str_export_to_enc(str, rb_default_external_encoding());
1381}
1382
1383VALUE
1385{
1386 return rb_str_export_to_enc(str, rb_locale_encoding());
1387}
1388
1389VALUE
1390rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1391{
1392 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1393}
1394
1395static VALUE
1396str_replace_shared_without_enc(VALUE str2, VALUE str)
1397{
1398 const int termlen = TERM_LEN(str);
1399 char *ptr;
1400 long len;
1401
1402 RSTRING_GETMEM(str, ptr, len);
1403 if (str_embed_capa(str2) >= len + termlen) {
1404 char *ptr2 = RSTRING(str2)->as.embed.ary;
1405 STR_SET_EMBED(str2);
1406 memcpy(ptr2, RSTRING_PTR(str), len);
1407 TERM_FILL(ptr2+len, termlen);
1408 }
1409 else {
1410 VALUE root;
1411 if (STR_SHARED_P(str)) {
1412 root = RSTRING(str)->as.heap.aux.shared;
1413 RSTRING_GETMEM(str, ptr, len);
1414 }
1415 else {
1416 root = rb_str_new_frozen(str);
1417 RSTRING_GETMEM(root, ptr, len);
1418 }
1419 RUBY_ASSERT(OBJ_FROZEN(root));
1420
1421 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1422 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1423 rb_fatal("about to free a possible shared root");
1424 }
1425 char *ptr2 = STR_HEAP_PTR(str2);
1426 if (ptr2 != ptr) {
1427 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1428 }
1429 }
1430 FL_SET(str2, STR_NOEMBED);
1431 RSTRING(str2)->as.heap.ptr = ptr;
1432 STR_SET_SHARED(str2, root);
1433 }
1434
1435 STR_SET_LEN(str2, len);
1436
1437 return str2;
1438}
1439
1440static VALUE
1441str_replace_shared(VALUE str2, VALUE str)
1442{
1443 str_replace_shared_without_enc(str2, str);
1444 rb_enc_cr_str_exact_copy(str2, str);
1445 return str2;
1446}
1447
1448static VALUE
1449str_new_shared(VALUE klass, VALUE str)
1450{
1451 return str_replace_shared(str_alloc_heap(klass), str);
1452}
1453
1454VALUE
1456{
1457 return str_new_shared(rb_obj_class(str), str);
1458}
1459
1460VALUE
1462{
1463 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1464 return str_new_frozen(rb_obj_class(orig), orig);
1465}
1466
1467static VALUE
1468rb_str_new_frozen_String(VALUE orig)
1469{
1470 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1471 return str_new_frozen(rb_cString, orig);
1472}
1473
1474VALUE
1475rb_str_tmp_frozen_acquire(VALUE orig)
1476{
1477 if (OBJ_FROZEN_RAW(orig)) return orig;
1478 return str_new_frozen_buffer(0, orig, FALSE);
1479}
1480
1481VALUE
1482rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1483{
1484 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1485 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1486
1487 VALUE str = str_alloc_heap(0);
1488 OBJ_FREEZE(str);
1489 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1490 FL_SET(str, STR_SHARED_ROOT);
1491
1492 size_t capa = str_capacity(orig, TERM_LEN(orig));
1493
1494 /* If the string is embedded then we want to create a copy that is heap
1495 * allocated. If the string is shared then the shared root must be
1496 * embedded, so we want to create a copy. If the string is a shared root
1497 * then it must be embedded, so we want to create a copy. */
1498 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1499 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1500 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1501 }
1502 else {
1503 /* orig must be heap allocated and not shared, so we can safely transfer
1504 * the pointer to str. */
1505 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1506 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1507 RBASIC(orig)->flags &= ~STR_NOFREE;
1508 STR_SET_SHARED(orig, str);
1509 }
1510
1511 RSTRING(str)->len = RSTRING(orig)->len;
1512 RSTRING(str)->as.heap.aux.capa = capa;
1513
1514 return str;
1515}
1516
1517void
1518rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1519{
1520 if (RBASIC_CLASS(tmp) != 0)
1521 return;
1522
1523 if (STR_EMBED_P(tmp)) {
1525 }
1526 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1527 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1528 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1529
1530 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1531 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1532 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1533
1534 /* Unshare orig since the root (tmp) only has this one child. */
1535 FL_UNSET_RAW(orig, STR_SHARED);
1536 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1537 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1539
1540 /* Make tmp embedded and empty so it is safe for sweeping. */
1541 STR_SET_EMBED(tmp);
1542 STR_SET_LEN(tmp, 0);
1543 }
1544 }
1545}
1546
1547static VALUE
1548str_new_frozen(VALUE klass, VALUE orig)
1549{
1550 return str_new_frozen_buffer(klass, orig, TRUE);
1551}
1552
1553static VALUE
1554heap_str_make_shared(VALUE klass, VALUE orig)
1555{
1556 RUBY_ASSERT(!STR_EMBED_P(orig));
1557 RUBY_ASSERT(!STR_SHARED_P(orig));
1558
1559 VALUE str = str_alloc_heap(klass);
1560 STR_SET_LEN(str, RSTRING_LEN(orig));
1561 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1562 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1563 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1564 RBASIC(orig)->flags &= ~STR_NOFREE;
1565 STR_SET_SHARED(orig, str);
1566 if (klass == 0)
1567 FL_UNSET_RAW(str, STR_BORROWED);
1568 return str;
1569}
1570
1571static VALUE
1572str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1573{
1574 VALUE str;
1575
1576 long len = RSTRING_LEN(orig);
1577 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1578 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1579
1580 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1581 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1582 RUBY_ASSERT(STR_EMBED_P(str));
1583 }
1584 else {
1585 if (FL_TEST_RAW(orig, STR_SHARED)) {
1586 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1587 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1588 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1589 RUBY_ASSERT(ofs >= 0);
1590 RUBY_ASSERT(rest >= 0);
1591 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1593
1594 if ((ofs > 0) || (rest > 0) ||
1595 (klass != RBASIC(shared)->klass) ||
1596 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1597 str = str_new_shared(klass, shared);
1598 RUBY_ASSERT(!STR_EMBED_P(str));
1599 RSTRING(str)->as.heap.ptr += ofs;
1600 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1601 }
1602 else {
1603 if (RBASIC_CLASS(shared) == 0)
1604 FL_SET_RAW(shared, STR_BORROWED);
1605 return shared;
1606 }
1607 }
1608 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1609 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1610 STR_SET_EMBED(str);
1611 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1612 STR_SET_LEN(str, RSTRING_LEN(orig));
1613 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1614 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1615 }
1616 else {
1617 str = heap_str_make_shared(klass, orig);
1618 }
1619 }
1620
1621 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1622 OBJ_FREEZE(str);
1623 return str;
1624}
1625
1626VALUE
1627rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1628{
1629 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1630}
1631
1632static VALUE
1633str_new_empty_String(VALUE str)
1634{
1635 VALUE v = rb_str_new(0, 0);
1636 rb_enc_copy(v, str);
1637 return v;
1638}
1639
1640#define STR_BUF_MIN_SIZE 63
1641
1642VALUE
1644{
1645 if (STR_EMBEDDABLE_P(capa, 1)) {
1646 return str_alloc_embed(rb_cString, capa + 1);
1647 }
1648
1649 VALUE str = str_alloc_heap(rb_cString);
1650
1651 RSTRING(str)->as.heap.aux.capa = capa;
1652 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1653 RSTRING(str)->as.heap.ptr[0] = '\0';
1654
1655 return str;
1656}
1657
1658VALUE
1659rb_str_buf_new_cstr(const char *ptr)
1660{
1661 VALUE str;
1662 long len = strlen(ptr);
1663
1664 str = rb_str_buf_new(len);
1665 rb_str_buf_cat(str, ptr, len);
1666
1667 return str;
1668}
1669
1670VALUE
1672{
1673 return str_new(0, 0, len);
1674}
1675
1676void
1678{
1679 if (STR_EMBED_P(str)) {
1680 RB_DEBUG_COUNTER_INC(obj_str_embed);
1681 }
1682 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1683 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1684 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1685 }
1686 else {
1687 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1688 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1689 }
1690}
1691
1692size_t
1693rb_str_memsize(VALUE str)
1694{
1695 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1696 return STR_HEAP_SIZE(str);
1697 }
1698 else {
1699 return 0;
1700 }
1701}
1702
1703VALUE
1705{
1706 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1707}
1708
1709static inline void str_discard(VALUE str);
1710static void str_shared_replace(VALUE str, VALUE str2);
1711
1712void
1714{
1715 if (str != str2) str_shared_replace(str, str2);
1716}
1717
1718static void
1719str_shared_replace(VALUE str, VALUE str2)
1720{
1721 rb_encoding *enc;
1722 int cr;
1723 int termlen;
1724
1725 RUBY_ASSERT(str2 != str);
1726 enc = STR_ENC_GET(str2);
1727 cr = ENC_CODERANGE(str2);
1728 str_discard(str);
1729 termlen = rb_enc_mbminlen(enc);
1730
1731 STR_SET_LEN(str, RSTRING_LEN(str2));
1732
1733 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1734 STR_SET_EMBED(str);
1735 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1736 rb_enc_associate(str, enc);
1737 ENC_CODERANGE_SET(str, cr);
1738 }
1739 else {
1740 if (STR_EMBED_P(str2)) {
1741 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1742 long len = RSTRING_LEN(str2);
1743 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1744
1745 char *new_ptr = ALLOC_N(char, len + termlen);
1746 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1747 RSTRING(str2)->as.heap.ptr = new_ptr;
1748 STR_SET_LEN(str2, len);
1749 RSTRING(str2)->as.heap.aux.capa = len;
1750 STR_SET_NOEMBED(str2);
1751 }
1752
1753 STR_SET_NOEMBED(str);
1754 FL_UNSET(str, STR_SHARED);
1755 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1756
1757 if (FL_TEST(str2, STR_SHARED)) {
1758 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1759 STR_SET_SHARED(str, shared);
1760 }
1761 else {
1762 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1763 }
1764
1765 /* abandon str2 */
1766 STR_SET_EMBED(str2);
1767 RSTRING_PTR(str2)[0] = 0;
1768 STR_SET_LEN(str2, 0);
1769 rb_enc_associate(str, enc);
1770 ENC_CODERANGE_SET(str, cr);
1771 }
1772}
1773
1774VALUE
1775rb_obj_as_string(VALUE obj)
1776{
1777 VALUE str;
1778
1779 if (RB_TYPE_P(obj, T_STRING)) {
1780 return obj;
1781 }
1782 str = rb_funcall(obj, idTo_s, 0);
1783 return rb_obj_as_string_result(str, obj);
1784}
1785
1786VALUE
1787rb_obj_as_string_result(VALUE str, VALUE obj)
1788{
1789 if (!RB_TYPE_P(str, T_STRING))
1790 return rb_any_to_s(obj);
1791 return str;
1792}
1793
1794static VALUE
1795str_replace(VALUE str, VALUE str2)
1796{
1797 long len;
1798
1799 len = RSTRING_LEN(str2);
1800 if (STR_SHARED_P(str2)) {
1801 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1803 STR_SET_NOEMBED(str);
1804 STR_SET_LEN(str, len);
1805 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1806 STR_SET_SHARED(str, shared);
1807 rb_enc_cr_str_exact_copy(str, str2);
1808 }
1809 else {
1810 str_replace_shared(str, str2);
1811 }
1812
1813 return str;
1814}
1815
1816static inline VALUE
1817ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1818{
1819 size_t size = rb_str_embed_size(capa);
1820 RUBY_ASSERT(size > 0);
1821 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1822
1823 NEWOBJ_OF(str, struct RString, klass,
1825
1826 return (VALUE)str;
1827}
1828
1829static inline VALUE
1830ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1831{
1832 NEWOBJ_OF(str, struct RString, klass,
1833 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1834
1835 return (VALUE)str;
1836}
1837
1838static inline VALUE
1839str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1840{
1841 int encidx = 0;
1842 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1843 encidx = rb_enc_get_index(str);
1844 flags &= ~ENCODING_MASK;
1845 }
1846 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1847 if (encidx) rb_enc_associate_index(dup, encidx);
1848 return dup;
1849}
1850
1851static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1852
1853static inline VALUE
1854str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1855{
1856 VALUE flags = FL_TEST_RAW(str, flag_mask);
1857 long len = RSTRING_LEN(str);
1858
1859 RUBY_ASSERT(STR_EMBED_P(dup));
1860 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1861 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1862 STR_SET_LEN(dup, RSTRING_LEN(str));
1863 return str_duplicate_setup_encoding(str, dup, flags);
1864}
1865
1866static inline VALUE
1867str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1868{
1869 VALUE flags = FL_TEST_RAW(str, flag_mask);
1870 VALUE root = str;
1871 if (FL_TEST_RAW(str, STR_SHARED)) {
1872 root = RSTRING(str)->as.heap.aux.shared;
1873 }
1874 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1875 root = str = str_new_frozen(klass, str);
1876 flags = FL_TEST_RAW(str, flag_mask);
1877 }
1878 RUBY_ASSERT(!STR_SHARED_P(root));
1880
1881 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1882 FL_SET(root, STR_SHARED_ROOT);
1883 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1884 flags |= RSTRING_NOEMBED | STR_SHARED;
1885
1886 STR_SET_LEN(dup, RSTRING_LEN(str));
1887 return str_duplicate_setup_encoding(str, dup, flags);
1888}
1889
1890static inline VALUE
1891str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1892{
1893 if (STR_EMBED_P(str)) {
1894 return str_duplicate_setup_embed(klass, str, dup);
1895 }
1896 else {
1897 return str_duplicate_setup_heap(klass, str, dup);
1898 }
1899}
1900
1901static inline VALUE
1902str_duplicate(VALUE klass, VALUE str)
1903{
1904 VALUE dup;
1905 if (STR_EMBED_P(str)) {
1906 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1907 }
1908 else {
1909 dup = str_alloc_heap(klass);
1910 }
1911
1912 return str_duplicate_setup(klass, str, dup);
1913}
1914
1915VALUE
1917{
1918 return str_duplicate(rb_obj_class(str), str);
1919}
1920
1921/* :nodoc: */
1922VALUE
1923rb_str_dup_m(VALUE str)
1924{
1925 if (LIKELY(BARE_STRING_P(str))) {
1926 return str_duplicate(rb_obj_class(str), str);
1927 }
1928 else {
1929 return rb_obj_dup(str);
1930 }
1931}
1932
1933VALUE
1935{
1936 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1937 return str_duplicate(rb_cString, str);
1938}
1939
1940VALUE
1941rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1942{
1943 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1944 VALUE new_str, klass = rb_cString;
1945
1946 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1947 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1948 str_duplicate_setup_embed(klass, str, new_str);
1949 }
1950 else {
1951 new_str = ec_str_alloc_heap(ec, klass);
1952 str_duplicate_setup_heap(klass, str, new_str);
1953 }
1954 if (chilled) {
1955 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1956 }
1957 return new_str;
1958}
1959
1960VALUE
1961rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1962{
1963 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1964 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1965 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1966 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1967 return rb_str_freeze(str);
1968}
1969
1970/*
1971 *
1972 * call-seq:
1973 * String.new(string = '', **opts) -> new_string
1974 *
1975 * :include: doc/string/new.rdoc
1976 *
1977 */
1978
1979static VALUE
1980rb_str_init(int argc, VALUE *argv, VALUE str)
1981{
1982 static ID keyword_ids[2];
1983 VALUE orig, opt, venc, vcapa;
1984 VALUE kwargs[2];
1985 rb_encoding *enc = 0;
1986 int n;
1987
1988 if (!keyword_ids[0]) {
1989 keyword_ids[0] = rb_id_encoding();
1990 CONST_ID(keyword_ids[1], "capacity");
1991 }
1992
1993 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1994 if (!NIL_P(opt)) {
1995 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1996 venc = kwargs[0];
1997 vcapa = kwargs[1];
1998 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1999 enc = rb_to_encoding(venc);
2000 }
2001 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2002 long capa = NUM2LONG(vcapa);
2003 long len = 0;
2004 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2005
2006 if (capa < STR_BUF_MIN_SIZE) {
2007 capa = STR_BUF_MIN_SIZE;
2008 }
2009 if (n == 1) {
2010 StringValue(orig);
2011 len = RSTRING_LEN(orig);
2012 if (capa < len) {
2013 capa = len;
2014 }
2015 if (orig == str) n = 0;
2016 }
2017 str_modifiable(str);
2018 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2019 /* make noembed always */
2020 const size_t size = (size_t)capa + termlen;
2021 const char *const old_ptr = RSTRING_PTR(str);
2022 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2023 char *new_ptr = ALLOC_N(char, size);
2024 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2025 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2026 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2027 RSTRING(str)->as.heap.ptr = new_ptr;
2028 }
2029 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2030 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2031 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2032 }
2033 STR_SET_LEN(str, len);
2034 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2035 if (n == 1) {
2036 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2037 rb_enc_cr_str_exact_copy(str, orig);
2038 }
2039 FL_SET(str, STR_NOEMBED);
2040 RSTRING(str)->as.heap.aux.capa = capa;
2041 }
2042 else if (n == 1) {
2043 rb_str_replace(str, orig);
2044 }
2045 if (enc) {
2046 rb_enc_associate(str, enc);
2048 }
2049 }
2050 else if (n == 1) {
2051 rb_str_replace(str, orig);
2052 }
2053 return str;
2054}
2055
2056/* :nodoc: */
2057static VALUE
2058rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2059{
2060 if (klass != rb_cString) {
2061 return rb_class_new_instance_pass_kw(argc, argv, klass);
2062 }
2063
2064 static ID keyword_ids[2];
2065 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2066 VALUE kwargs[2];
2067 rb_encoding *enc = NULL;
2068
2069 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2070 if (NIL_P(opt)) {
2071 return rb_class_new_instance_pass_kw(argc, argv, klass);
2072 }
2073
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1], "capacity");
2076 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2077 encoding = kwargs[0];
2078 capacity = kwargs[1];
2079
2080 if (n == 1) {
2081 orig = StringValue(orig);
2082 }
2083 else {
2084 orig = Qnil;
2085 }
2086
2087 if (UNDEF_P(encoding)) {
2088 if (!NIL_P(orig)) {
2089 encoding = rb_obj_encoding(orig);
2090 }
2091 }
2092
2093 if (!UNDEF_P(encoding)) {
2094 enc = rb_to_encoding(encoding);
2095 }
2096
2097 // If capacity is nil, we're basically just duping `orig`.
2098 if (UNDEF_P(capacity)) {
2099 if (NIL_P(orig)) {
2100 VALUE empty_str = str_new(klass, "", 0);
2101 if (enc) {
2102 rb_enc_associate(empty_str, enc);
2103 }
2104 return empty_str;
2105 }
2106 VALUE copy = str_duplicate(klass, orig);
2107 rb_enc_associate(copy, enc);
2108 ENC_CODERANGE_CLEAR(copy);
2109 return copy;
2110 }
2111
2112 long capa = 0;
2113 capa = NUM2LONG(capacity);
2114 if (capa < 0) {
2115 capa = 0;
2116 }
2117
2118 if (!NIL_P(orig)) {
2119 long orig_capa = rb_str_capacity(orig);
2120 if (orig_capa > capa) {
2121 capa = orig_capa;
2122 }
2123 }
2124
2125 VALUE str = str_enc_new(klass, NULL, capa, enc);
2126 STR_SET_LEN(str, 0);
2127 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2128
2129 if (!NIL_P(orig)) {
2130 rb_str_buf_append(str, orig);
2131 }
2132
2133 return str;
2134}
2135
2136#ifdef NONASCII_MASK
2137#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2138
2139/*
2140 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2141 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2142 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2143 *
2144 * if (!(byte & 0x80))
2145 * byte |= 0x40; // turn on bit6
2146 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2147 *
2148 * This function calculates whether a byte is leading or not for all bytes
2149 * in the argument word by concurrently using the above logic, and then
2150 * adds up the number of leading bytes in the word.
2151 */
2152static inline uintptr_t
2153count_utf8_lead_bytes_with_word(const uintptr_t *s)
2154{
2155 uintptr_t d = *s;
2156
2157 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2158 d = (d>>6) | (~d>>7);
2159 d &= NONASCII_MASK >> 7;
2160
2161 /* Gather all bytes. */
2162#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2163 /* use only if it can use POPCNT */
2164 return rb_popcount_intptr(d);
2165#else
2166 d += (d>>8);
2167 d += (d>>16);
2168# if SIZEOF_VOIDP == 8
2169 d += (d>>32);
2170# endif
2171 return (d&0xF);
2172#endif
2173}
2174#endif
2175
2176static inline long
2177enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2178{
2179 long c;
2180 const char *q;
2181
2182 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2183 long diff = (long)(e - p);
2184 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2185 }
2186#ifdef NONASCII_MASK
2187 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2188 uintptr_t len = 0;
2189 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2190 const uintptr_t *s, *t;
2191 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2192 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2193 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2194 while (p < (const char *)s) {
2195 if (is_utf8_lead_byte(*p)) len++;
2196 p++;
2197 }
2198 while (s < t) {
2199 len += count_utf8_lead_bytes_with_word(s);
2200 s++;
2201 }
2202 p = (const char *)s;
2203 }
2204 while (p < e) {
2205 if (is_utf8_lead_byte(*p)) len++;
2206 p++;
2207 }
2208 return (long)len;
2209 }
2210#endif
2211 else if (rb_enc_asciicompat(enc)) {
2212 c = 0;
2213 if (ENC_CODERANGE_CLEAN_P(cr)) {
2214 while (p < e) {
2215 if (ISASCII(*p)) {
2216 q = search_nonascii(p, e);
2217 if (!q)
2218 return c + (e - p);
2219 c += q - p;
2220 p = q;
2221 }
2222 p += rb_enc_fast_mbclen(p, e, enc);
2223 c++;
2224 }
2225 }
2226 else {
2227 while (p < e) {
2228 if (ISASCII(*p)) {
2229 q = search_nonascii(p, e);
2230 if (!q)
2231 return c + (e - p);
2232 c += q - p;
2233 p = q;
2234 }
2235 p += rb_enc_mbclen(p, e, enc);
2236 c++;
2237 }
2238 }
2239 return c;
2240 }
2241
2242 for (c=0; p<e; c++) {
2243 p += rb_enc_mbclen(p, e, enc);
2244 }
2245 return c;
2246}
2247
2248long
2249rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2250{
2251 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2252}
2253
2254/* To get strlen with cr
2255 * Note that given cr is not used.
2256 */
2257long
2258rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2259{
2260 long c;
2261 const char *q;
2262 int ret;
2263
2264 *cr = 0;
2265 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266 long diff = (long)(e - p);
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268 }
2269 else if (rb_enc_asciicompat(enc)) {
2270 c = 0;
2271 while (p < e) {
2272 if (ISASCII(*p)) {
2273 q = search_nonascii(p, e);
2274 if (!q) {
2275 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2276 return c + (e - p);
2277 }
2278 c += q - p;
2279 p = q;
2280 }
2281 ret = rb_enc_precise_mbclen(p, e, enc);
2282 if (MBCLEN_CHARFOUND_P(ret)) {
2283 *cr |= ENC_CODERANGE_VALID;
2284 p += MBCLEN_CHARFOUND_LEN(ret);
2285 }
2286 else {
2288 p++;
2289 }
2290 c++;
2291 }
2292 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2293 return c;
2294 }
2295
2296 for (c=0; p<e; c++) {
2297 ret = rb_enc_precise_mbclen(p, e, enc);
2298 if (MBCLEN_CHARFOUND_P(ret)) {
2299 *cr |= ENC_CODERANGE_VALID;
2300 p += MBCLEN_CHARFOUND_LEN(ret);
2301 }
2302 else {
2304 if (p + rb_enc_mbminlen(enc) <= e)
2305 p += rb_enc_mbminlen(enc);
2306 else
2307 p = e;
2308 }
2309 }
2310 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2311 return c;
2312}
2313
2314/* enc must be str's enc or rb_enc_check(str, str2) */
2315static long
2316str_strlen(VALUE str, rb_encoding *enc)
2317{
2318 const char *p, *e;
2319 int cr;
2320
2321 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2322 if (!enc) enc = STR_ENC_GET(str);
2323 p = RSTRING_PTR(str);
2324 e = RSTRING_END(str);
2325 cr = ENC_CODERANGE(str);
2326
2327 if (cr == ENC_CODERANGE_UNKNOWN) {
2328 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2329 if (cr) ENC_CODERANGE_SET(str, cr);
2330 return n;
2331 }
2332 else {
2333 return enc_strlen(p, e, enc, cr);
2334 }
2335}
2336
2337long
2339{
2340 return str_strlen(str, NULL);
2341}
2342
2343/*
2344 * call-seq:
2345 * length -> integer
2346 *
2347 * :include: doc/string/length.rdoc
2348 *
2349 */
2350
2351VALUE
2353{
2354 return LONG2NUM(str_strlen(str, NULL));
2355}
2356
2357/*
2358 * call-seq:
2359 * bytesize -> integer
2360 *
2361 * :include: doc/string/bytesize.rdoc
2362 *
2363 */
2364
2365VALUE
2366rb_str_bytesize(VALUE str)
2367{
2368 return LONG2NUM(RSTRING_LEN(str));
2369}
2370
2371/*
2372 * call-seq:
2373 * empty? -> true or false
2374 *
2375 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2376 *
2377 * "hello".empty? # => false
2378 * " ".empty? # => false
2379 * "".empty? # => true
2380 *
2381 */
2382
2383static VALUE
2384rb_str_empty(VALUE str)
2385{
2386 return RBOOL(RSTRING_LEN(str) == 0);
2387}
2388
2389/*
2390 * call-seq:
2391 * string + other_string -> new_string
2392 *
2393 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2394 *
2395 * "Hello from " + self.to_s # => "Hello from main"
2396 *
2397 */
2398
2399VALUE
2401{
2402 VALUE str3;
2403 rb_encoding *enc;
2404 char *ptr1, *ptr2, *ptr3;
2405 long len1, len2;
2406 int termlen;
2407
2408 StringValue(str2);
2409 enc = rb_enc_check_str(str1, str2);
2410 RSTRING_GETMEM(str1, ptr1, len1);
2411 RSTRING_GETMEM(str2, ptr2, len2);
2412 termlen = rb_enc_mbminlen(enc);
2413 if (len1 > LONG_MAX - len2) {
2414 rb_raise(rb_eArgError, "string size too big");
2415 }
2416 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2417 ptr3 = RSTRING_PTR(str3);
2418 memcpy(ptr3, ptr1, len1);
2419 memcpy(ptr3+len1, ptr2, len2);
2420 TERM_FILL(&ptr3[len1+len2], termlen);
2421
2422 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2424 RB_GC_GUARD(str1);
2425 RB_GC_GUARD(str2);
2426 return str3;
2427}
2428
2429/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2430VALUE
2431rb_str_opt_plus(VALUE str1, VALUE str2)
2432{
2435 long len1, len2;
2436 MAYBE_UNUSED(char) *ptr1, *ptr2;
2437 RSTRING_GETMEM(str1, ptr1, len1);
2438 RSTRING_GETMEM(str2, ptr2, len2);
2439 int enc1 = rb_enc_get_index(str1);
2440 int enc2 = rb_enc_get_index(str2);
2441
2442 if (enc1 < 0) {
2443 return Qundef;
2444 }
2445 else if (enc2 < 0) {
2446 return Qundef;
2447 }
2448 else if (enc1 != enc2) {
2449 return Qundef;
2450 }
2451 else if (len1 > LONG_MAX - len2) {
2452 return Qundef;
2453 }
2454 else {
2455 return rb_str_plus(str1, str2);
2456 }
2457
2458}
2459
2460/*
2461 * call-seq:
2462 * string * integer -> new_string
2463 *
2464 * Returns a new +String+ containing +integer+ copies of +self+:
2465 *
2466 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2467 * "Ho! " * 0 # => ""
2468 *
2469 */
2470
2471VALUE
2473{
2474 VALUE str2;
2475 long n, len;
2476 char *ptr2;
2477 int termlen;
2478
2479 if (times == INT2FIX(1)) {
2480 return str_duplicate(rb_cString, str);
2481 }
2482 if (times == INT2FIX(0)) {
2483 str2 = str_alloc_embed(rb_cString, 0);
2484 rb_enc_copy(str2, str);
2485 return str2;
2486 }
2487 len = NUM2LONG(times);
2488 if (len < 0) {
2489 rb_raise(rb_eArgError, "negative argument");
2490 }
2491 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2492 if (STR_EMBEDDABLE_P(len, 1)) {
2493 str2 = str_alloc_embed(rb_cString, len + 1);
2494 memset(RSTRING_PTR(str2), 0, len + 1);
2495 }
2496 else {
2497 str2 = str_alloc_heap(rb_cString);
2498 RSTRING(str2)->as.heap.aux.capa = len;
2499 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2500 }
2501 STR_SET_LEN(str2, len);
2502 rb_enc_copy(str2, str);
2503 return str2;
2504 }
2505 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2506 rb_raise(rb_eArgError, "argument too big");
2507 }
2508
2509 len *= RSTRING_LEN(str);
2510 termlen = TERM_LEN(str);
2511 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2512 ptr2 = RSTRING_PTR(str2);
2513 if (len) {
2514 n = RSTRING_LEN(str);
2515 memcpy(ptr2, RSTRING_PTR(str), n);
2516 while (n <= len/2) {
2517 memcpy(ptr2 + n, ptr2, n);
2518 n *= 2;
2519 }
2520 memcpy(ptr2 + n, ptr2, len-n);
2521 }
2522 STR_SET_LEN(str2, len);
2523 TERM_FILL(&ptr2[len], termlen);
2524 rb_enc_cr_str_copy_for_substr(str2, str);
2525
2526 return str2;
2527}
2528
2529/*
2530 * call-seq:
2531 * string % object -> new_string
2532 *
2533 * Returns the result of formatting +object+ into the format specification +self+
2534 * (see Kernel#sprintf for formatting details):
2535 *
2536 * "%05d" % 123 # => "00123"
2537 *
2538 * If +self+ contains multiple substitutions, +object+ must be
2539 * an Array or Hash containing the values to be substituted:
2540 *
2541 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2542 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2543 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2544 *
2545 */
2546
2547static VALUE
2548rb_str_format_m(VALUE str, VALUE arg)
2549{
2550 VALUE tmp = rb_check_array_type(arg);
2551
2552 if (!NIL_P(tmp)) {
2553 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2554 }
2555 return rb_str_format(1, &arg, str);
2556}
2557
2558static inline void
2559rb_check_lockedtmp(VALUE str)
2560{
2561 if (FL_TEST(str, STR_TMPLOCK)) {
2562 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2563 }
2564}
2565
2566// If none of these flags are set, we know we have an modifiable string.
2567// If any is set, we need to do more detailed checks.
2568#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2569static inline void
2570str_modifiable(VALUE str)
2571{
2572 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2573 if (CHILLED_STRING_P(str)) {
2574 CHILLED_STRING_MUTATED(str);
2575 }
2576 rb_check_lockedtmp(str);
2577 rb_check_frozen(str);
2578 }
2579}
2580
2581static inline int
2582str_dependent_p(VALUE str)
2583{
2584 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2585 return FALSE;
2586 }
2587 else {
2588 return TRUE;
2589 }
2590}
2591
2592// If none of these flags are set, we know we have an independent string.
2593// If any is set, we need to do more detailed checks.
2594#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2595static inline int
2596str_independent(VALUE str)
2597{
2598 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2599 str_modifiable(str);
2600 return !str_dependent_p(str);
2601 }
2602 return TRUE;
2603}
2604
2605static void
2606str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2607{
2608 char *ptr;
2609 char *oldptr;
2610 long capa = len + expand;
2611
2612 if (len > capa) len = capa;
2613
2614 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2615 ptr = RSTRING(str)->as.heap.ptr;
2616 STR_SET_EMBED(str);
2617 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2618 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2619 STR_SET_LEN(str, len);
2620 return;
2621 }
2622
2623 ptr = ALLOC_N(char, (size_t)capa + termlen);
2624 oldptr = RSTRING_PTR(str);
2625 if (oldptr) {
2626 memcpy(ptr, oldptr, len);
2627 }
2628 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2629 xfree(oldptr);
2630 }
2631 STR_SET_NOEMBED(str);
2632 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2633 TERM_FILL(ptr + len, termlen);
2634 RSTRING(str)->as.heap.ptr = ptr;
2635 STR_SET_LEN(str, len);
2636 RSTRING(str)->as.heap.aux.capa = capa;
2637}
2638
2639void
2640rb_str_modify(VALUE str)
2641{
2642 if (!str_independent(str))
2643 str_make_independent(str);
2645}
2646
2647void
2649{
2650 int termlen = TERM_LEN(str);
2651 long len = RSTRING_LEN(str);
2652
2653 if (expand < 0) {
2654 rb_raise(rb_eArgError, "negative expanding string size");
2655 }
2656 if (expand >= LONG_MAX - len) {
2657 rb_raise(rb_eArgError, "string size too big");
2658 }
2659
2660 if (!str_independent(str)) {
2661 str_make_independent_expand(str, len, expand, termlen);
2662 }
2663 else if (expand > 0) {
2664 RESIZE_CAPA_TERM(str, len + expand, termlen);
2665 }
2667}
2668
2669/* As rb_str_modify(), but don't clear coderange */
2670static void
2671str_modify_keep_cr(VALUE str)
2672{
2673 if (!str_independent(str))
2674 str_make_independent(str);
2676 /* Force re-scan later */
2678}
2679
2680static inline void
2681str_discard(VALUE str)
2682{
2683 str_modifiable(str);
2684 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2685 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2686 RSTRING(str)->as.heap.ptr = 0;
2687 STR_SET_LEN(str, 0);
2688 }
2689}
2690
2691void
2693{
2694 int encindex = rb_enc_get_index(str);
2695
2696 if (RB_UNLIKELY(encindex == -1)) {
2697 rb_raise(rb_eTypeError, "not encoding capable object");
2698 }
2699
2700 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2701 return;
2702 }
2703
2704 rb_encoding *enc = rb_enc_from_index(encindex);
2705 if (!rb_enc_asciicompat(enc)) {
2706 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2707 }
2708}
2709
2710VALUE
2712{
2713 VALUE s = *ptr;
2714 if (!RB_TYPE_P(s, T_STRING)) {
2715 s = rb_str_to_str(s);
2716 *ptr = s;
2717 }
2718 return s;
2719}
2720
2721char *
2723{
2724 VALUE str = rb_string_value(ptr);
2725 return RSTRING_PTR(str);
2726}
2727
/* Returns 1 when the first +n+ bytes at +s+ are all zero (trivially so
 * for n <= 0), and 0 as soon as a non-zero byte is seen. */
static int
zero_filled(const char *s, int n)
{
    const char *cur = s;

    while (n > 0) {
        if (*cur != '\0') return 0;
        cur++;
        n--;
    }
    return 1;
}
2736
2737static const char *
2738str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2739{
2740 const char *e = s + len;
2741
2742 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2743 if (zero_filled(s, minlen)) return s;
2744 }
2745 return 0;
2746}
2747
2748static char *
2749str_fill_term(VALUE str, char *s, long len, int termlen)
2750{
2751 /* This function assumes that (capa + termlen) bytes of memory
2752 * is allocated, like many other functions in this file.
2753 */
2754 if (str_dependent_p(str)) {
2755 if (!zero_filled(s + len, termlen))
2756 str_make_independent_expand(str, len, 0L, termlen);
2757 }
2758 else {
2759 TERM_FILL(s + len, termlen);
2760 return s;
2761 }
2762 return RSTRING_PTR(str);
2763}
2764
2765void
2766rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2767{
2768 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2769 long len = RSTRING_LEN(str);
2770
2771 RUBY_ASSERT(capa >= len);
2772 if (capa - len < termlen) {
2773 rb_check_lockedtmp(str);
2774 str_make_independent_expand(str, len, 0L, termlen);
2775 }
2776 else if (str_dependent_p(str)) {
2777 if (termlen > oldtermlen)
2778 str_make_independent_expand(str, len, 0L, termlen);
2779 }
2780 else {
2781 if (!STR_EMBED_P(str)) {
2782 /* modify capa instead of realloc */
2783 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2784 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2785 }
2786 if (termlen > oldtermlen) {
2787 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2788 }
2789 }
2790
2791 return;
2792}
2793
2794static char *
2795str_null_check(VALUE str, int *w)
2796{
2797 char *s = RSTRING_PTR(str);
2798 long len = RSTRING_LEN(str);
2799 rb_encoding *enc = rb_enc_get(str);
2800 const int minlen = rb_enc_mbminlen(enc);
2801
2802 if (minlen > 1) {
2803 *w = 1;
2804 if (str_null_char(s, len, minlen, enc)) {
2805 return NULL;
2806 }
2807 return str_fill_term(str, s, len, minlen);
2808 }
2809 *w = 0;
2810 if (!s || memchr(s, 0, len)) {
2811 return NULL;
2812 }
2813 if (s[len]) {
2814 s = str_fill_term(str, s, len, minlen);
2815 }
2816 return s;
2817}
2818
2819char *
2820rb_str_to_cstr(VALUE str)
2821{
2822 int w;
2823 return str_null_check(str, &w);
2824}
2825
2826char *
2828{
2829 VALUE str = rb_string_value(ptr);
2830 int w;
2831 char *s = str_null_check(str, &w);
2832 if (!s) {
2833 if (w) {
2834 rb_raise(rb_eArgError, "string contains null char");
2835 }
2836 rb_raise(rb_eArgError, "string contains null byte");
2837 }
2838 return s;
2839}
2840
2841char *
2842rb_str_fill_terminator(VALUE str, const int newminlen)
2843{
2844 char *s = RSTRING_PTR(str);
2845 long len = RSTRING_LEN(str);
2846 return str_fill_term(str, s, len, newminlen);
2847}
2848
2849VALUE
2851{
2852 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2853 return str;
2854}
2855
2856/*
2857 * call-seq:
2858 * String.try_convert(object) -> object, new_string, or nil
2859 *
2860 * If +object+ is a +String+ object, returns +object+.
2861 *
2862 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2863 * calls <tt>object.to_str</tt> and returns the result.
2864 *
2865 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2866 *
2867 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2868 */
2869static VALUE
2870rb_str_s_try_convert(VALUE dummy, VALUE str)
2871{
2872 return rb_check_string_type(str);
2873}
2874
2875static char*
2876str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2877{
2878 long nth = *nthp;
2879 if (rb_enc_mbmaxlen(enc) == 1) {
2880 p += nth;
2881 }
2882 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2883 p += nth * rb_enc_mbmaxlen(enc);
2884 }
2885 else if (rb_enc_asciicompat(enc)) {
2886 const char *p2, *e2;
2887 int n;
2888
2889 while (p < e && 0 < nth) {
2890 e2 = p + nth;
2891 if (e < e2) {
2892 *nthp = nth;
2893 return (char *)e;
2894 }
2895 if (ISASCII(*p)) {
2896 p2 = search_nonascii(p, e2);
2897 if (!p2) {
2898 nth -= e2 - p;
2899 *nthp = nth;
2900 return (char *)e2;
2901 }
2902 nth -= p2 - p;
2903 p = p2;
2904 }
2905 n = rb_enc_mbclen(p, e, enc);
2906 p += n;
2907 nth--;
2908 }
2909 *nthp = nth;
2910 if (nth != 0) {
2911 return (char *)e;
2912 }
2913 return (char *)p;
2914 }
2915 else {
2916 while (p < e && nth--) {
2917 p += rb_enc_mbclen(p, e, enc);
2918 }
2919 }
2920 if (p > e) p = e;
2921 *nthp = nth;
2922 return (char*)p;
2923}
2924
2925char*
2926rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2927{
2928 return str_nth_len(p, e, &nth, enc);
2929}
2930
2931static char*
2932str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2933{
2934 if (singlebyte)
2935 p += nth;
2936 else {
2937 p = str_nth_len(p, e, &nth, enc);
2938 }
2939 if (!p) return 0;
2940 if (p > e) p = e;
2941 return (char *)p;
2942}
2943
2944/* char offset to byte offset */
2945static long
2946str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2947{
2948 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2949 if (!pp) return e - p;
2950 return pp - p;
2951}
2952
2953long
2954rb_str_offset(VALUE str, long pos)
2955{
2956 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2957 STR_ENC_GET(str), single_byte_optimizable(str));
2958}
2959
2960#ifdef NONASCII_MASK
2961static char *
2962str_utf8_nth(const char *p, const char *e, long *nthp)
2963{
2964 long nth = *nthp;
2965 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2966 const uintptr_t *s, *t;
2967 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2968 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2969 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2970 while (p < (const char *)s) {
2971 if (is_utf8_lead_byte(*p)) nth--;
2972 p++;
2973 }
2974 do {
2975 nth -= count_utf8_lead_bytes_with_word(s);
2976 s++;
2977 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2978 p = (char *)s;
2979 }
2980 while (p < e) {
2981 if (is_utf8_lead_byte(*p)) {
2982 if (nth == 0) break;
2983 nth--;
2984 }
2985 p++;
2986 }
2987 *nthp = nth;
2988 return (char *)p;
2989}
2990
2991static long
2992str_utf8_offset(const char *p, const char *e, long nth)
2993{
2994 const char *pp = str_utf8_nth(p, e, &nth);
2995 return pp - p;
2996}
2997#endif
2998
2999/* byte offset to char offset */
3000long
3001rb_str_sublen(VALUE str, long pos)
3002{
3003 if (single_byte_optimizable(str) || pos < 0)
3004 return pos;
3005 else {
3006 char *p = RSTRING_PTR(str);
3007 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3008 }
3009}
3010
3011static VALUE
3012str_subseq(VALUE str, long beg, long len)
3013{
3014 VALUE str2;
3015
3016 RUBY_ASSERT(beg >= 0);
3017 RUBY_ASSERT(len >= 0);
3018 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3019
3020 const int termlen = TERM_LEN(str);
3021 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3022 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3023 RB_GC_GUARD(str);
3024 return str2;
3025 }
3026
3027 str2 = str_alloc_heap(rb_cString);
3028 if (str_embed_capa(str2) >= len + termlen) {
3029 char *ptr2 = RSTRING(str2)->as.embed.ary;
3030 STR_SET_EMBED(str2);
3031 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3032 TERM_FILL(ptr2+len, termlen);
3033
3034 STR_SET_LEN(str2, len);
3035 RB_GC_GUARD(str);
3036 }
3037 else {
3038 str_replace_shared(str2, str);
3039 RUBY_ASSERT(!STR_EMBED_P(str2));
3040 ENC_CODERANGE_CLEAR(str2);
3041 RSTRING(str2)->as.heap.ptr += beg;
3042 if (RSTRING_LEN(str2) > len) {
3043 STR_SET_LEN(str2, len);
3044 }
3045 }
3046
3047 return str2;
3048}
3049
3050VALUE
3051rb_str_subseq(VALUE str, long beg, long len)
3052{
3053 VALUE str2 = str_subseq(str, beg, len);
3054 rb_enc_cr_str_copy_for_substr(str2, str);
3055 return str2;
3056}
3057
3058char *
3059rb_str_subpos(VALUE str, long beg, long *lenp)
3060{
3061 long len = *lenp;
3062 long slen = -1L;
3063 const long blen = RSTRING_LEN(str);
3064 rb_encoding *enc = STR_ENC_GET(str);
3065 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3066
3067 if (len < 0) return 0;
3068 if (beg < 0 && -beg < 0) return 0;
3069 if (!blen) {
3070 len = 0;
3071 }
3072 if (single_byte_optimizable(str)) {
3073 if (beg > blen) return 0;
3074 if (beg < 0) {
3075 beg += blen;
3076 if (beg < 0) return 0;
3077 }
3078 if (len > blen - beg)
3079 len = blen - beg;
3080 if (len < 0) return 0;
3081 p = s + beg;
3082 goto end;
3083 }
3084 if (beg < 0) {
3085 if (len > -beg) len = -beg;
3086 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3087 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3088 beg = -beg;
3089 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3090 p = e;
3091 if (!p) return 0;
3092 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3093 if (!p) return 0;
3094 len = e - p;
3095 goto end;
3096 }
3097 else {
3098 slen = str_strlen(str, enc);
3099 beg += slen;
3100 if (beg < 0) return 0;
3101 p = s + beg;
3102 if (len == 0) goto end;
3103 }
3104 }
3105 else if (beg > 0 && beg > blen) {
3106 return 0;
3107 }
3108 if (len == 0) {
3109 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3110 p = s + beg;
3111 }
3112#ifdef NONASCII_MASK
3113 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3114 enc == rb_utf8_encoding()) {
3115 p = str_utf8_nth(s, e, &beg);
3116 if (beg > 0) return 0;
3117 len = str_utf8_offset(p, e, len);
3118 }
3119#endif
3120 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3121 int char_sz = rb_enc_mbmaxlen(enc);
3122
3123 p = s + beg * char_sz;
3124 if (p > e) {
3125 return 0;
3126 }
3127 else if (len * char_sz > e - p)
3128 len = e - p;
3129 else
3130 len *= char_sz;
3131 }
3132 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3133 if (beg > 0) return 0;
3134 len = 0;
3135 }
3136 else {
3137 len = str_offset(p, e, len, enc, 0);
3138 }
3139 end:
3140 *lenp = len;
3141 RB_GC_GUARD(str);
3142 return p;
3143}
3144
3145static VALUE str_substr(VALUE str, long beg, long len, int empty);
3146
3147VALUE
3148rb_str_substr(VALUE str, long beg, long len)
3149{
3150 return str_substr(str, beg, len, TRUE);
3151}
3152
3153VALUE
3154rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3155{
3156 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3157}
3158
3159static VALUE
3160str_substr(VALUE str, long beg, long len, int empty)
3161{
3162 char *p = rb_str_subpos(str, beg, &len);
3163
3164 if (!p) return Qnil;
3165 if (!len && !empty) return Qnil;
3166
3167 beg = p - RSTRING_PTR(str);
3168
3169 VALUE str2 = str_subseq(str, beg, len);
3170 rb_enc_cr_str_copy_for_substr(str2, str);
3171 return str2;
3172}
3173
3174/* :nodoc: */
3175VALUE
3177{
3178 if (CHILLED_STRING_P(str)) {
3179 FL_UNSET_RAW(str, STR_CHILLED);
3180 }
3181
3182 if (OBJ_FROZEN(str)) return str;
3183 rb_str_resize(str, RSTRING_LEN(str));
3184 return rb_obj_freeze(str);
3185}
3186
3187/*
3188 * call-seq:
3189 * +string -> new_string or self
3190 *
3191 * Returns +self+ if +self+ is not frozen and can be mutated
3192 * without warning issuance.
3193 *
3194 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3195 */
3196static VALUE
3197str_uplus(VALUE str)
3198{
3199 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3200 return rb_str_dup(str);
3201 }
3202 else {
3203 return str;
3204 }
3205}
3206
3207/*
3208 * call-seq:
3209 * -string -> frozen_string
3210 * dedup -> frozen_string
3211 *
3212 * Returns a frozen, possibly pre-existing copy of the string.
3213 *
3214 * The returned +String+ will be deduplicated as long as it does not have
3215 * any instance variables set on it and is not a String subclass.
3216 *
3217 * Note that <tt>-string</tt> variant is more convenient for defining
3218 * constants:
3219 *
3220 * FILENAME = -'config/database.yml'
3221 *
 *  while +dedup+ is better suited to chains
 *  of method calls:
3224 *
3225 * @url_list.concat(urls.map(&:dedup))
3226 *
3227 */
3228static VALUE
3229str_uminus(VALUE str)
3230{
3231 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3232 str = rb_str_dup(str);
3233 }
3234 return rb_fstring(str);
3235}
3236
3237RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3238#define rb_str_dup_frozen rb_str_new_frozen
3239
3240VALUE
3242{
3243 if (FL_TEST(str, STR_TMPLOCK)) {
3244 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3245 }
3246 FL_SET(str, STR_TMPLOCK);
3247 return str;
3248}
3249
3250VALUE
3252{
3253 if (!FL_TEST(str, STR_TMPLOCK)) {
3254 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3255 }
3256 FL_UNSET(str, STR_TMPLOCK);
3257 return str;
3258}
3259
3260VALUE
3261rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3262{
3263 rb_str_locktmp(str);
3264 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3265}
3266
3267void
3269{
3270 long capa;
3271 const int termlen = TERM_LEN(str);
3272
3273 str_modifiable(str);
3274 if (STR_SHARED_P(str)) {
3275 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3276 }
3277 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3278 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3279 }
3280
3281 int cr = ENC_CODERANGE(str);
3282 if (len == 0) {
3283 /* Empty string does not contain non-ASCII */
3285 }
3286 else if (cr == ENC_CODERANGE_UNKNOWN) {
3287 /* Leave unknown. */
3288 }
3289 else if (len > RSTRING_LEN(str)) {
3290 if (ENC_CODERANGE_CLEAN_P(cr)) {
3291 /* Update the coderange regarding the extended part. */
3292 const char *const prev_end = RSTRING_END(str);
3293 const char *const new_end = RSTRING_PTR(str) + len;
3294 rb_encoding *enc = rb_enc_get(str);
3295 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3296 ENC_CODERANGE_SET(str, cr);
3297 }
3298 else if (cr == ENC_CODERANGE_BROKEN) {
3299 /* May be valid now, by appended part. */
3301 }
3302 }
3303 else if (len < RSTRING_LEN(str)) {
3304 if (cr != ENC_CODERANGE_7BIT) {
3305 /* ASCII-only string is keeping after truncated. Valid
3306 * and broken may be invalid or valid, leave unknown. */
3308 }
3309 }
3310
3311 STR_SET_LEN(str, len);
3312 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3313}
3314
3315VALUE
3316rb_str_resize(VALUE str, long len)
3317{
3318 if (len < 0) {
3319 rb_raise(rb_eArgError, "negative string size (or size too big)");
3320 }
3321
3322 int independent = str_independent(str);
3323 long slen = RSTRING_LEN(str);
3324 const int termlen = TERM_LEN(str);
3325
3326 if (slen > len || (termlen != 1 && slen < len)) {
3328 }
3329
3330 {
3331 long capa;
3332 if (STR_EMBED_P(str)) {
3333 if (len == slen) return str;
3334 if (str_embed_capa(str) >= len + termlen) {
3335 STR_SET_LEN(str, len);
3336 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3337 return str;
3338 }
3339 str_make_independent_expand(str, slen, len - slen, termlen);
3340 }
3341 else if (str_embed_capa(str) >= len + termlen) {
3342 char *ptr = STR_HEAP_PTR(str);
3343 STR_SET_EMBED(str);
3344 if (slen > len) slen = len;
3345 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3346 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3347 STR_SET_LEN(str, len);
3348 if (independent) ruby_xfree(ptr);
3349 return str;
3350 }
3351 else if (!independent) {
3352 if (len == slen) return str;
3353 str_make_independent_expand(str, slen, len - slen, termlen);
3354 }
3355 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3356 (capa - len) > (len < 1024 ? len : 1024)) {
3357 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3358 (size_t)len + termlen, STR_HEAP_SIZE(str));
3359 RSTRING(str)->as.heap.aux.capa = len;
3360 }
3361 else if (len == slen) return str;
3362 STR_SET_LEN(str, len);
3363 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3364 }
3365 return str;
3366}
3367
3368static void
3369str_ensure_available_capa(VALUE str, long len)
3370{
3371 str_modify_keep_cr(str);
3372
3373 const int termlen = TERM_LEN(str);
3374 long olen = RSTRING_LEN(str);
3375
3376 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3377 rb_raise(rb_eArgError, "string sizes too big");
3378 }
3379
3380 long total = olen + len;
3381 long capa = str_capacity(str, termlen);
3382
3383 if (capa < total) {
3384 if (total >= LONG_MAX / 2) {
3385 capa = total;
3386 }
3387 while (total > capa) {
3388 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3389 }
3390 RESIZE_CAPA_TERM(str, capa, termlen);
3391 }
3392}
3393
3394static VALUE
3395str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3396{
3397 if (keep_cr) {
3398 str_modify_keep_cr(str);
3399 }
3400 else {
3401 rb_str_modify(str);
3402 }
3403 if (len == 0) return 0;
3404
3405 long total, olen, off = -1;
3406 char *sptr;
3407 const int termlen = TERM_LEN(str);
3408
3409 RSTRING_GETMEM(str, sptr, olen);
3410 if (ptr >= sptr && ptr <= sptr + olen) {
3411 off = ptr - sptr;
3412 }
3413
3414 long capa = str_capacity(str, termlen);
3415
3416 if (olen > LONG_MAX - len) {
3417 rb_raise(rb_eArgError, "string sizes too big");
3418 }
3419 total = olen + len;
3420 if (capa < total) {
3421 if (total >= LONG_MAX / 2) {
3422 capa = total;
3423 }
3424 while (total > capa) {
3425 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3426 }
3427 RESIZE_CAPA_TERM(str, capa, termlen);
3428 sptr = RSTRING_PTR(str);
3429 }
3430 if (off != -1) {
3431 ptr = sptr + off;
3432 }
3433 memcpy(sptr + olen, ptr, len);
3434 STR_SET_LEN(str, total);
3435 TERM_FILL(sptr + total, termlen); /* sentinel */
3436
3437 return str;
3438}
3439
3440#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3441#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3442
3443VALUE
3444rb_str_cat(VALUE str, const char *ptr, long len)
3445{
3446 if (len == 0) return str;
3447 if (len < 0) {
3448 rb_raise(rb_eArgError, "negative string size (or size too big)");
3449 }
3450 return str_buf_cat(str, ptr, len);
3451}
3452
3453VALUE
3454rb_str_cat_cstr(VALUE str, const char *ptr)
3455{
3456 must_not_null(ptr);
3457 return rb_str_buf_cat(str, ptr, strlen(ptr));
3458}
3459
3460static void
3461rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3462{
3463 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3464
3465 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3466 if (UNLIKELY(!str_independent(str))) {
3467 str_make_independent(str);
3468 }
3469
3470 long string_length = -1;
3471 const int null_terminator_length = 1;
3472 char *sptr;
3473 RSTRING_GETMEM(str, sptr, string_length);
3474
3475 // Ensure the resulting string wouldn't be too long.
3476 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3477 rb_raise(rb_eArgError, "string sizes too big");
3478 }
3479
3480 long string_capacity = str_capacity(str, null_terminator_length);
3481
3482 // Get the code range before any modifications since those might clear the code range.
3483 int cr = ENC_CODERANGE(str);
3484
3485 // Check if the string has spare string_capacity to write the new byte.
3486 if (LIKELY(string_capacity >= string_length + 1)) {
3487 // In fast path we can write the new byte and note the string's new length.
3488 sptr[string_length] = byte;
3489 STR_SET_LEN(str, string_length + 1);
3490 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3491 }
3492 else {
3493 // If there's not enough string_capacity, make a call into the general string concatenation function.
3494 str_buf_cat(str, (char *)&byte, 1);
3495 }
3496
3497 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3498 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3499 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3500 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3501 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3502 if (ISASCII(byte)) {
3504 }
3505 else {
3507
3508 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3509 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3510 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3511 }
3512 }
3513 }
3514}
3515
3516RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3517RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3518RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3519
3520static VALUE
3521rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3522 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3523{
3524 int str_encindex = ENCODING_GET(str);
3525 int res_encindex;
3526 int str_cr, res_cr;
3527 rb_encoding *str_enc, *ptr_enc;
3528
3529 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3530
3531 if (str_encindex == ptr_encindex) {
3532 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3533 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3534 }
3535 }
3536 else {
3537 str_enc = rb_enc_from_index(str_encindex);
3538 ptr_enc = rb_enc_from_index(ptr_encindex);
3539 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3540 if (len == 0)
3541 return str;
3542 if (RSTRING_LEN(str) == 0) {
3543 rb_str_buf_cat(str, ptr, len);
3544 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3545 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3546 return str;
3547 }
3548 goto incompatible;
3549 }
3550 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3551 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3552 }
3553 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3554 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3555 str_cr = rb_enc_str_coderange(str);
3556 }
3557 }
3558 }
3559 if (ptr_cr_ret)
3560 *ptr_cr_ret = ptr_cr;
3561
3562 if (str_encindex != ptr_encindex &&
3563 str_cr != ENC_CODERANGE_7BIT &&
3564 ptr_cr != ENC_CODERANGE_7BIT) {
3565 str_enc = rb_enc_from_index(str_encindex);
3566 ptr_enc = rb_enc_from_index(ptr_encindex);
3567 goto incompatible;
3568 }
3569
3570 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3571 res_encindex = str_encindex;
3572 res_cr = ENC_CODERANGE_UNKNOWN;
3573 }
3574 else if (str_cr == ENC_CODERANGE_7BIT) {
3575 if (ptr_cr == ENC_CODERANGE_7BIT) {
3576 res_encindex = str_encindex;
3577 res_cr = ENC_CODERANGE_7BIT;
3578 }
3579 else {
3580 res_encindex = ptr_encindex;
3581 res_cr = ptr_cr;
3582 }
3583 }
3584 else if (str_cr == ENC_CODERANGE_VALID) {
3585 res_encindex = str_encindex;
3586 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3587 res_cr = str_cr;
3588 else
3589 res_cr = ptr_cr;
3590 }
3591 else { /* str_cr == ENC_CODERANGE_BROKEN */
3592 res_encindex = str_encindex;
3593 res_cr = str_cr;
3594 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3595 }
3596
3597 if (len < 0) {
3598 rb_raise(rb_eArgError, "negative string size (or size too big)");
3599 }
3600 str_buf_cat(str, ptr, len);
3601 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3602 return str;
3603
3604 incompatible:
3605 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3606 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3608}
3609
3610VALUE
3611rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3612{
3613 return rb_enc_cr_str_buf_cat(str, ptr, len,
3614 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3615}
3616
3617VALUE
3618rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3619{
3620 /* ptr must reference NUL terminated ASCII string. */
3621 int encindex = ENCODING_GET(str);
3622 rb_encoding *enc = rb_enc_from_index(encindex);
3623 if (rb_enc_asciicompat(enc)) {
3624 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3625 encindex, ENC_CODERANGE_7BIT, 0);
3626 }
3627 else {
3628 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3629 while (*ptr) {
3630 unsigned int c = (unsigned char)*ptr;
3631 int len = rb_enc_codelen(c, enc);
3632 rb_enc_mbcput(c, buf, enc);
3633 rb_enc_cr_str_buf_cat(str, buf, len,
3634 encindex, ENC_CODERANGE_VALID, 0);
3635 ptr++;
3636 }
3637 return str;
3638 }
3639}
3640
3641VALUE
3643{
3644 int str2_cr = rb_enc_str_coderange(str2);
3645
3646 if (str_enc_fastpath(str)) {
3647 switch (str2_cr) {
3648 case ENC_CODERANGE_7BIT:
3649 // If RHS is 7bit we can do simple concatenation
3650 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3651 RB_GC_GUARD(str2);
3652 return str;
3654 // If RHS is valid, we can do simple concatenation if encodings are the same
3655 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3656 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3657 int str_cr = ENC_CODERANGE(str);
3658 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3659 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3660 }
3661 RB_GC_GUARD(str2);
3662 return str;
3663 }
3664 }
3665 }
3666
3667 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3668 ENCODING_GET(str2), str2_cr, &str2_cr);
3669
3670 ENC_CODERANGE_SET(str2, str2_cr);
3671
3672 return str;
3673}
3674
3675VALUE
3677{
3678 StringValue(str2);
3679 return rb_str_buf_append(str, str2);
3680}
3681
3682VALUE
3683rb_str_concat_literals(size_t num, const VALUE *strary)
3684{
3685 VALUE str;
3686 size_t i, s = 0;
3687 unsigned long len = 1;
3688
3689 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3690 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3691
3692 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3693 str = rb_str_buf_new(len);
3694 str_enc_copy_direct(str, strary[0]);
3695
3696 for (i = s; i < num; ++i) {
3697 const VALUE v = strary[i];
3698 int encidx = ENCODING_GET(v);
3699
3700 rb_str_buf_append(str, v);
3701 if (encidx != ENCINDEX_US_ASCII) {
3702 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3703 rb_enc_set_index(str, encidx);
3704 }
3705 }
3706 return str;
3707}
3708
3709/*
3710 * call-seq:
3711 * concat(*objects) -> string
3712 *
3713 * Concatenates each object in +objects+ to +self+ and returns +self+:
3714 *
3715 * s = 'foo'
3716 * s.concat('bar', 'baz') # => "foobarbaz"
3717 * s # => "foobarbaz"
3718 *
3719 * For each given object +object+ that is an Integer,
3720 * the value is considered a codepoint and converted to a character before concatenation:
3721 *
3722 * s = 'foo'
3723 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3724 *
3725 * Related: String#<<, which takes a single argument.
3726 */
3727static VALUE
3728rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3729{
3730 str_modifiable(str);
3731
3732 if (argc == 1) {
3733 return rb_str_concat(str, argv[0]);
3734 }
3735 else if (argc > 1) {
3736 int i;
3737 VALUE arg_str = rb_str_tmp_new(0);
3738 rb_enc_copy(arg_str, str);
3739 for (i = 0; i < argc; i++) {
3740 rb_str_concat(arg_str, argv[i]);
3741 }
3742 rb_str_buf_append(str, arg_str);
3743 }
3744
3745 return str;
3746}
3747
3748/*
3749 * call-seq:
3750 * append_as_bytes(*objects) -> string
3751 *
3752 * Concatenates each object in +objects+ into +self+ without any encoding
3753 * validation or conversion and returns +self+:
3754 *
3755 * s = 'foo'
3756 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3757 * s.valid_encoding? # => false
3758 * s.append_as_bytes("\xAC 12")
3759 * s.valid_encoding? # => true
3760 *
3761 * For each given object +object+ that is an Integer,
3762 * the value is considered a Byte. If the Integer is bigger
3763 * than one byte, only the lower byte is considered, similar to String#setbyte:
3764 *
3765 * s = ""
3766 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3767 *
3768 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3769 */
3770
3771VALUE
3772rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3773{
3774 long needed_capacity = 0;
3775 volatile VALUE t0;
3776 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3777
3778 for (int index = 0; index < argc; index++) {
3779 VALUE obj = argv[index];
3780 enum ruby_value_type type = types[index] = rb_type(obj);
3781 switch (type) {
3782 case T_FIXNUM:
3783 case T_BIGNUM:
3784 needed_capacity++;
3785 break;
3786 case T_STRING:
3787 needed_capacity += RSTRING_LEN(obj);
3788 break;
3789 default:
3790 rb_raise(
3792 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3793 rb_obj_class(obj)
3794 );
3795 break;
3796 }
3797 }
3798
3799 str_ensure_available_capa(str, needed_capacity);
3800 char *sptr = RSTRING_END(str);
3801
3802 for (int index = 0; index < argc; index++) {
3803 VALUE obj = argv[index];
3804 enum ruby_value_type type = types[index];
3805 switch (type) {
3806 case T_FIXNUM:
3807 case T_BIGNUM: {
3808 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3809 char byte = (char)(NUM2INT(obj) & 0xFF);
3810 *sptr = byte;
3811 sptr++;
3812 break;
3813 }
3814 case T_STRING: {
3815 const char *ptr;
3816 long len;
3817 RSTRING_GETMEM(obj, ptr, len);
3818 memcpy(sptr, ptr, len);
3819 sptr += len;
3820 break;
3821 }
3822 default:
3823 rb_bug("append_as_bytes arguments should have been validated");
3824 }
3825 }
3826
3827 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3828 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3829
3830 int cr = ENC_CODERANGE(str);
3831 switch (cr) {
3832 case ENC_CODERANGE_7BIT: {
3833 for (int index = 0; index < argc; index++) {
3834 VALUE obj = argv[index];
3835 enum ruby_value_type type = types[index];
3836 switch (type) {
3837 case T_FIXNUM:
3838 case T_BIGNUM: {
3839 if (!ISASCII(NUM2INT(obj))) {
3840 goto clear_cr;
3841 }
3842 break;
3843 }
3844 case T_STRING: {
3845 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3846 goto clear_cr;
3847 }
3848 break;
3849 }
3850 default:
3851 rb_bug("append_as_bytes arguments should have been validated");
3852 }
3853 }
3854 break;
3855 }
3857 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3858 goto keep_cr;
3859 }
3860 else {
3861 goto clear_cr;
3862 }
3863 break;
3864 default:
3865 goto clear_cr;
3866 break;
3867 }
3868
3869 RB_GC_GUARD(t0);
3870
3871 clear_cr:
3872 // If no fast path was hit, we clear the coderange.
3873 // append_as_bytes is predominently meant to be used in
3874 // buffering situation, hence it's likely the coderange
3875 // will never be scanned, so it's not worth spending time
3876 // precomputing the coderange except for simple and common
3877 // situations.
3879 keep_cr:
3880 return str;
3881}
3882
3883/*
3884 * call-seq:
3885 * string << object -> string
3886 *
3887 * Concatenates +object+ to +self+ and returns +self+:
3888 *
3889 * s = 'foo'
3890 * s << 'bar' # => "foobar"
3891 * s # => "foobar"
3892 *
3893 * If +object+ is an Integer,
3894 * the value is considered a codepoint and converted to a character before concatenation:
3895 *
3896 * s = 'foo'
3897 * s << 33 # => "foo!"
3898 *
3899 * If that codepoint is not representable in the encoding of
3900 * _string_, RangeError is raised.
3901 *
3902 * s = 'foo'
3903 * s.encoding # => <Encoding:UTF-8>
3904 * s << 0x00110000 # 1114112 out of char range (RangeError)
3905 * s = 'foo'.encode('EUC-JP')
3906 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3907 *
3908 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3909 * is automatically promoted to ASCII-8BIT.
3910 *
3911 * s = 'foo'.encode('US-ASCII')
3912 * s << 0xff
3913 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3914 *
3915 * Related: String#concat, which takes multiple arguments.
3916 */
3917VALUE
3919{
3920 unsigned int code;
3921 rb_encoding *enc = STR_ENC_GET(str1);
3922 int encidx;
3923
3924 if (RB_INTEGER_TYPE_P(str2)) {
3925 if (rb_num_to_uint(str2, &code) == 0) {
3926 }
3927 else if (FIXNUM_P(str2)) {
3928 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3929 }
3930 else {
3931 rb_raise(rb_eRangeError, "bignum out of char range");
3932 }
3933 }
3934 else {
3935 return rb_str_append(str1, str2);
3936 }
3937
3938 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3939
3940 if (encidx >= 0) {
3941 rb_str_buf_cat_byte(str1, (unsigned char)code);
3942 }
3943 else {
3944 long pos = RSTRING_LEN(str1);
3945 int cr = ENC_CODERANGE(str1);
3946 int len;
3947 char *buf;
3948
3949 switch (len = rb_enc_codelen(code, enc)) {
3950 case ONIGERR_INVALID_CODE_POINT_VALUE:
3951 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3952 break;
3953 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3954 case 0:
3955 rb_raise(rb_eRangeError, "%u out of char range", code);
3956 break;
3957 }
3958 buf = ALLOCA_N(char, len + 1);
3959 rb_enc_mbcput(code, buf, enc);
3960 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3961 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3962 }
3963 rb_str_resize(str1, pos+len);
3964 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3965 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3967 }
3968 else if (cr == ENC_CODERANGE_BROKEN) {
3970 }
3971 ENC_CODERANGE_SET(str1, cr);
3972 }
3973 return str1;
3974}
3975
3976int
3977rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3978{
3979 int encidx = rb_enc_to_index(enc);
3980
3981 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3982 /* US-ASCII automatically extended to ASCII-8BIT */
3983 if (code > 0xFF) {
3984 rb_raise(rb_eRangeError, "%u out of char range", code);
3985 }
3986 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3987 return ENCINDEX_ASCII_8BIT;
3988 }
3989 return encidx;
3990 }
3991 else {
3992 return -1;
3993 }
3994}
3995
3996/*
3997 * call-seq:
3998 * prepend(*other_strings) -> string
3999 *
4000 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4001 *
4002 * s = 'foo'
4003 * s.prepend('bar', 'baz') # => "barbazfoo"
4004 * s # => "barbazfoo"
4005 *
4006 * Related: String#concat.
4007 */
4008
4009static VALUE
4010rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4011{
4012 str_modifiable(str);
4013
4014 if (argc == 1) {
4015 rb_str_update(str, 0L, 0L, argv[0]);
4016 }
4017 else if (argc > 1) {
4018 int i;
4019 VALUE arg_str = rb_str_tmp_new(0);
4020 rb_enc_copy(arg_str, str);
4021 for (i = 0; i < argc; i++) {
4022 rb_str_append(arg_str, argv[i]);
4023 }
4024 rb_str_update(str, 0L, 0L, arg_str);
4025 }
4026
4027 return str;
4028}
4029
4030st_index_t
4032{
4033 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4034 st_index_t precomputed_hash;
4035 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4036
4037 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4038 return precomputed_hash;
4039 }
4040
4041 return str_do_hash(str);
4042}
4043
4044int
4046{
4047 long len1, len2;
4048 const char *ptr1, *ptr2;
4049 RSTRING_GETMEM(str1, ptr1, len1);
4050 RSTRING_GETMEM(str2, ptr2, len2);
4051 return (len1 != len2 ||
4052 !rb_str_comparable(str1, str2) ||
4053 memcmp(ptr1, ptr2, len1) != 0);
4054}
4055
4056/*
4057 * call-seq:
4058 * hash -> integer
4059 *
4060 * Returns the integer hash value for +self+.
4061 * The value is based on the length, content and encoding of +self+.
4062 *
4063 * Related: Object#hash.
4064 */
4065
4066static VALUE
4067rb_str_hash_m(VALUE str)
4068{
4069 st_index_t hval = rb_str_hash(str);
4070 return ST2FIX(hval);
4071}
4072
/* Smaller of a and b.  Function-like macro: each argument may be evaluated
 * twice, so pass expressions without side effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4074
4075int
4077{
4078 int idx1, idx2;
4079 int rc1, rc2;
4080
4081 if (RSTRING_LEN(str1) == 0) return TRUE;
4082 if (RSTRING_LEN(str2) == 0) return TRUE;
4083 idx1 = ENCODING_GET(str1);
4084 idx2 = ENCODING_GET(str2);
4085 if (idx1 == idx2) return TRUE;
4086 rc1 = rb_enc_str_coderange(str1);
4087 rc2 = rb_enc_str_coderange(str2);
4088 if (rc1 == ENC_CODERANGE_7BIT) {
4089 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4090 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4091 return TRUE;
4092 }
4093 if (rc2 == ENC_CODERANGE_7BIT) {
4094 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4095 return TRUE;
4096 }
4097 return FALSE;
4098}
4099
4100int
4102{
4103 long len1, len2;
4104 const char *ptr1, *ptr2;
4105 int retval;
4106
4107 if (str1 == str2) return 0;
4108 RSTRING_GETMEM(str1, ptr1, len1);
4109 RSTRING_GETMEM(str2, ptr2, len2);
4110 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4111 if (len1 == len2) {
4112 if (!rb_str_comparable(str1, str2)) {
4113 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4114 return 1;
4115 return -1;
4116 }
4117 return 0;
4118 }
4119 if (len1 > len2) return 1;
4120 return -1;
4121 }
4122 if (retval > 0) return 1;
4123 return -1;
4124}
4125
4126/*
4127 * call-seq:
4128 * string == object -> true or false
4129 * string === object -> true or false
4130 *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
4133 *
4134 * s = 'foo'
4135 * s == 'foo' # => true
4136 * s == 'food' # => false
4137 * s == 'FOO' # => false
4138 *
4139 * Returns +false+ if the two strings' encodings are not compatible:
4140 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4141 *
4142 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4143 * two strings are compared using <code>object.==</code>.
4144 */
4145
4146VALUE
4148{
4149 if (str1 == str2) return Qtrue;
4150 if (!RB_TYPE_P(str2, T_STRING)) {
4151 if (!rb_respond_to(str2, idTo_str)) {
4152 return Qfalse;
4153 }
4154 return rb_equal(str2, str1);
4155 }
4156 return rb_str_eql_internal(str1, str2);
4157}
4158
4159/*
4160 * call-seq:
4161 * eql?(object) -> true or false
4162 *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
4165 *
4166 * s = 'foo'
4167 * s.eql?('foo') # => true
4168 * s.eql?('food') # => false
4169 * s.eql?('FOO') # => false
4170 *
4171 * Returns +false+ if the two strings' encodings are not compatible:
4172 *
4173 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4174 *
4175 */
4176
4177VALUE
4178rb_str_eql(VALUE str1, VALUE str2)
4179{
4180 if (str1 == str2) return Qtrue;
4181 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4182 return rb_str_eql_internal(str1, str2);
4183}
4184
4185/*
4186 * call-seq:
4187 * string <=> other_string -> -1, 0, 1, or nil
4188 *
4189 * Compares +self+ and +other_string+, returning:
4190 *
4191 * - -1 if +other_string+ is larger.
4192 * - 0 if the two are equal.
4193 * - 1 if +other_string+ is smaller.
4194 * - +nil+ if the two are incomparable.
4195 *
4196 * Examples:
4197 *
4198 * 'foo' <=> 'foo' # => 0
4199 * 'foo' <=> 'food' # => -1
4200 * 'food' <=> 'foo' # => 1
4201 * 'FOO' <=> 'foo' # => -1
4202 * 'foo' <=> 'FOO' # => 1
4203 * 'foo' <=> 1 # => nil
4204 *
4205 */
4206
4207static VALUE
4208rb_str_cmp_m(VALUE str1, VALUE str2)
4209{
4210 int result;
4211 VALUE s = rb_check_string_type(str2);
4212 if (NIL_P(s)) {
4213 return rb_invcmp(str1, str2);
4214 }
4215 result = rb_str_cmp(str1, s);
4216 return INT2FIX(result);
4217}
4218
/* Forward declarations of the comparison helpers defined further down;
 * both take operands that are already Strings. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4221
4222/*
4223 * call-seq:
4224 * casecmp(other_string) -> -1, 0, 1, or nil
4225 *
4226 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4227 *
4228 * - -1 if <tt>other_string.downcase</tt> is larger.
4229 * - 0 if the two are equal.
4230 * - 1 if <tt>other_string.downcase</tt> is smaller.
4231 * - +nil+ if the two are incomparable.
4232 *
4233 * Examples:
4234 *
4235 * 'foo'.casecmp('foo') # => 0
4236 * 'foo'.casecmp('food') # => -1
4237 * 'food'.casecmp('foo') # => 1
4238 * 'FOO'.casecmp('foo') # => 0
4239 * 'foo'.casecmp('FOO') # => 0
4240 * 'foo'.casecmp(1) # => nil
4241 *
4242 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4243 *
4244 * Related: String#casecmp?.
4245 *
4246 */
4247
4248static VALUE
4249rb_str_casecmp(VALUE str1, VALUE str2)
4250{
4251 VALUE s = rb_check_string_type(str2);
4252 if (NIL_P(s)) {
4253 return Qnil;
4254 }
4255 return str_casecmp(str1, s);
4256}
4257
/* Case-insensitive three-way comparison of two Strings.
 *
 * Returns Qnil when rb_enc_compatible() finds no common encoding,
 * otherwise INT2FIX(-1), INT2FIX(0) or INT2FIX(1).  Characters are folded
 * with TOLOWER before being compared; when all compared characters match,
 * the byte lengths of the two strings decide. */
static VALUE
str_casecmp(VALUE str1, VALUE str2)
{
    long len;
    rb_encoding *enc;
    const char *p1, *p1end, *p2, *p2end;

    enc = rb_enc_compatible(str1, str2);
    if (!enc) {
        return Qnil;
    }

    p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
    p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
        /* Both strings can be walked one byte at a time. */
        while (p1 < p1end && p2 < p2end) {
            if (*p1 != *p2) {
                /* Only fold when the raw bytes differ. */
                unsigned int c1 = TOLOWER(*p1 & 0xff);
                unsigned int c2 = TOLOWER(*p2 & 0xff);
                if (c1 != c2)
                    return INT2FIX(c1 < c2 ? -1 : 1);
            }
            p1++;
            p2++;
        }
    }
    else {
        /* Walk character by character; l1/l2 hold the byte length of the
         * current character on each side. */
        while (p1 < p1end && p2 < p2end) {
            int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
            int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);

            if (0 <= c1 && 0 <= c2) {
                /* rb_enc_ascget() returned a character on both sides. */
                c1 = TOLOWER(c1);
                c2 = TOLOWER(c2);
                if (c1 != c2)
                    return INT2FIX(c1 < c2 ? -1 : 1);
            }
            else {
                /* Otherwise compare the raw bytes of the two characters;
                 * on a common-prefix tie the shorter character sorts
                 * first. */
                int r;
                l1 = rb_enc_mbclen(p1, p1end, enc);
                l2 = rb_enc_mbclen(p2, p2end, enc);
                len = l1 < l2 ? l1 : l2;
                r = memcmp(p1, p2, len);
                if (r != 0)
                    return INT2FIX(r < 0 ? -1 : 1);
                if (l1 != l2)
                    return INT2FIX(l1 < l2 ? -1 : 1);
            }
            p1 += l1;
            p2 += l2;
        }
    }
    /* One side ran out: the string with more bytes is the greater one. */
    if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
    if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
    return INT2FIX(-1);
}
4314
4315/*
4316 * call-seq:
4317 * casecmp?(other_string) -> true, false, or nil
4318 *
4319 * Returns +true+ if +self+ and +other_string+ are equal after
4320 * Unicode case folding, otherwise +false+:
4321 *
4322 * 'foo'.casecmp?('foo') # => true
4323 * 'foo'.casecmp?('food') # => false
4324 * 'food'.casecmp?('foo') # => false
4325 * 'FOO'.casecmp?('foo') # => true
4326 * 'foo'.casecmp?('FOO') # => true
4327 *
4328 * Returns +nil+ if the two values are incomparable:
4329 *
4330 * 'foo'.casecmp?(1) # => nil
4331 *
4332 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4333 *
4334 * Related: String#casecmp.
4335 *
4336 */
4337
4338static VALUE
4339rb_str_casecmp_p(VALUE str1, VALUE str2)
4340{
4341 VALUE s = rb_check_string_type(str2);
4342 if (NIL_P(s)) {
4343 return Qnil;
4344 }
4345 return str_casecmp_p(str1, s);
4346}
4347
4348static VALUE
4349str_casecmp_p(VALUE str1, VALUE str2)
4350{
4351 rb_encoding *enc;
4352 VALUE folded_str1, folded_str2;
4353 VALUE fold_opt = sym_fold;
4354
4355 enc = rb_enc_compatible(str1, str2);
4356 if (!enc) {
4357 return Qnil;
4358 }
4359
4360 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4361 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4362
4363 return rb_str_eql(folded_str1, folded_str2);
4364}
4365
/* Searches forward for the sub_len bytes at sub_ptr, starting at str_ptr
 * and covering str_len - offset bytes.  str_ptr_end bounds the character
 * head lookup.
 *
 * A byte-level hit from rb_memsearch() is accepted only when
 * rb_enc_right_char_head() confirms that it starts on a character head;
 * otherwise the search resumes from the next character head.
 *
 * Returns the hit position plus the accumulated offset, or a negative
 * value when nothing is found. */
static long
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
            const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
{
    const char *search_start = str_ptr;
    long pos, search_len = str_len - offset;

    for (;;) {
        const char *t;
        pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
        if (pos < 0) return pos;
        /* t is the first character head at or after the hit. */
        t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
        if (t == search_start + pos) break;
        /* Hit inside a character: skip ahead to t and retry, keeping
         * offset in step with the bytes skipped. */
        search_len -= t - search_start;
        if (search_len <= 0) return -1;
        offset += t - search_start;
        search_start = t;
    }
    return pos + offset;
}
4386
/* Forward search wrappers around rb_strseq_index(); the found index is in
 * bytes.  rb_str_index passes in_byte = 0, rb_str_byteindex passes
 * in_byte = 1. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4390
/* Finds the first occurrence of sub in str at or after offset.
 *
 * in_byte selects how offset and the length checks are counted: bytes
 * when non-zero, otherwise via str_strlen() (bytes again when str is
 * single-byte optimizable).  A negative offset counts from the end.
 *
 * Returns the byte index of the match, or -1 when sub is broken, longer
 * than str, the offset is out of range, or nothing matches.  Encoding
 * compatibility is checked with rb_enc_check(). */
static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{
    const char *str_ptr, *str_ptr_end, *sub_ptr;
    long str_len, sub_len;
    rb_encoding *enc;

    enc = rb_enc_check(str, sub);
    if (is_broken_string(sub)) return -1;

    str_ptr = RSTRING_PTR(str);
    str_ptr_end = RSTRING_END(str);
    str_len = RSTRING_LEN(str);
    sub_ptr = RSTRING_PTR(sub);
    sub_len = RSTRING_LEN(sub);

    if (str_len < sub_len) return -1;

    if (offset != 0) {
        long str_len_char, sub_len_char;
        int single_byte = single_byte_optimizable(str);
        str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
        sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
        if (offset < 0) {
            /* Negative offset: count back from the end. */
            offset += str_len_char;
            if (offset < 0) return -1;
        }
        /* Not enough room left for sub after offset. */
        if (str_len_char - offset < sub_len_char) return -1;
        /* From here on offset is a byte offset. */
        if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
        str_ptr += offset;
    }
    /* An empty sub matches right at the starting offset. */
    if (sub_len == 0) return offset;

    /* need proceed one character at a time */
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
}
4427
4428
4429/*
4430 * call-seq:
4431 * index(substring, offset = 0) -> integer or nil
4432 * index(regexp, offset = 0) -> integer or nil
4433 *
4434 * :include: doc/string/index.rdoc
4435 *
4436 */
4437
4438static VALUE
4439rb_str_index_m(int argc, VALUE *argv, VALUE str)
4440{
4441 VALUE sub;
4442 VALUE initpos;
4443 rb_encoding *enc = STR_ENC_GET(str);
4444 long pos;
4445
4446 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4447 long slen = str_strlen(str, enc); /* str's enc */
4448 pos = NUM2LONG(initpos);
4449 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4450 if (RB_TYPE_P(sub, T_REGEXP)) {
4452 }
4453 return Qnil;
4454 }
4455 }
4456 else {
4457 pos = 0;
4458 }
4459
4460 if (RB_TYPE_P(sub, T_REGEXP)) {
4461 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4462 enc, single_byte_optimizable(str));
4463
4464 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4465 VALUE match = rb_backref_get();
4466 struct re_registers *regs = RMATCH_REGS(match);
4467 pos = rb_str_sublen(str, BEG(0));
4468 return LONG2NUM(pos);
4469 }
4470 }
4471 else {
4472 StringValue(sub);
4473 pos = rb_str_index(str, sub, pos);
4474 if (pos >= 0) {
4475 pos = rb_str_sublen(str, pos);
4476 return LONG2NUM(pos);
4477 }
4478 }
4479 return Qnil;
4480}
4481
4482/* Ensure that the given pos is a valid character boundary.
4483 * Note that in this function, "character" means a code point
4484 * (Unicode scalar value), not a grapheme cluster.
4485 */
4486static void
4487str_ensure_byte_pos(VALUE str, long pos)
4488{
4489 if (!single_byte_optimizable(str)) {
4490 const char *s = RSTRING_PTR(str);
4491 const char *e = RSTRING_END(str);
4492 const char *p = s + pos;
4493 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4494 rb_raise(rb_eIndexError,
4495 "offset %ld does not land on character boundary", pos);
4496 }
4497 }
4498}
4499
4500/*
4501 * call-seq:
4502 * byteindex(substring, offset = 0) -> integer or nil
4503 * byteindex(regexp, offset = 0) -> integer or nil
4504 *
4505 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4506 * or +nil+ if none found:
4507 *
4508 * 'foo'.byteindex('f') # => 0
4509 * 'foo'.byteindex('o') # => 1
4510 * 'foo'.byteindex('oo') # => 1
4511 * 'foo'.byteindex('ooo') # => nil
4512 *
4513 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4514 * or +nil+ if none found:
4515 *
4516 * 'foo'.byteindex(/f/) # => 0
4517 * 'foo'.byteindex(/o/) # => 1
4518 * 'foo'.byteindex(/oo/) # => 1
4519 * 'foo'.byteindex(/ooo/) # => nil
4520 *
4521 * Integer argument +offset+, if given, specifies the byte-based position in the
4522 * string to begin the search:
4523 *
4524 * 'foo'.byteindex('o', 1) # => 1
4525 * 'foo'.byteindex('o', 2) # => 2
4526 * 'foo'.byteindex('o', 3) # => nil
4527 *
4528 * If +offset+ is negative, counts backward from the end of +self+:
4529 *
4530 * 'foo'.byteindex('o', -1) # => 2
4531 * 'foo'.byteindex('o', -2) # => 1
4532 * 'foo'.byteindex('o', -3) # => 1
4533 * 'foo'.byteindex('o', -4) # => nil
4534 *
4535 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4536 * raised.
4537 *
4538 * Related: String#index, String#byterindex.
4539 */
4540
4541static VALUE
4542rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4543{
4544 VALUE sub;
4545 VALUE initpos;
4546 long pos;
4547
4548 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4549 long slen = RSTRING_LEN(str);
4550 pos = NUM2LONG(initpos);
4551 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4552 if (RB_TYPE_P(sub, T_REGEXP)) {
4554 }
4555 return Qnil;
4556 }
4557 }
4558 else {
4559 pos = 0;
4560 }
4561
4562 str_ensure_byte_pos(str, pos);
4563
4564 if (RB_TYPE_P(sub, T_REGEXP)) {
4565 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4566 VALUE match = rb_backref_get();
4567 struct re_registers *regs = RMATCH_REGS(match);
4568 pos = BEG(0);
4569 return LONG2NUM(pos);
4570 }
4571 }
4572 else {
4573 StringValue(sub);
4574 pos = rb_str_byteindex(str, sub, pos);
4575 if (pos >= 0) return LONG2NUM(pos);
4576 }
4577 return Qnil;
4578}
4579
#ifndef HAVE_MEMRCHR
/* Fallback for platforms without memrchr(): returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or a
 * null pointer when there is none.  Bytes are compared as unsigned char. */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = search_str + (remaining - 1);
        if ((unsigned char)*candidate == chr) return (void *)candidate;
    }

    return ((void *)0);
}
#endif
4592
/* Backward search for sub in str, starting at pointer s (which points
 * into str's buffer) and moving towards the beginning.
 *
 * Returns the byte offset of the match from the start of str, or -1 when
 * there is none.  An empty sub matches at s itself. */
static long
str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
{
    char *hit, *adjusted;
    int c;
    long slen, searchlen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    slen = RSTRING_LEN(sub);
    if (slen == 0) return s - sbeg;
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    /* First byte of sub, used as the needle for memrchr(). */
    c = *t & 0xff;
    searchlen = s - sbeg + 1;

    /* Try the starting position itself first. */
    if (memcmp(s, t, slen) == 0) {
        return s - sbeg;
    }

    do {
        hit = memrchr(sbeg, c, searchlen);
        if (!hit) break;
        /* Ignore hits that are not at the head of a character. */
        adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
        if (hit != adjusted) {
            searchlen = adjusted - sbeg;
            continue;
        }
        if (memcmp(hit, t, slen) == 0)
            return hit - sbeg;
        /* No match here: keep searching before this character. */
        searchlen = adjusted - sbeg;
    } while (searchlen > 0);

    return -1;
}
4628
4629/* found index in byte */
4630static long
4631rb_str_rindex(VALUE str, VALUE sub, long pos)
4632{
4633 long len, slen;
4634 char *sbeg, *s;
4635 rb_encoding *enc;
4636 int singlebyte;
4637
4638 enc = rb_enc_check(str, sub);
4639 if (is_broken_string(sub)) return -1;
4640 singlebyte = single_byte_optimizable(str);
4641 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4642 slen = str_strlen(sub, enc); /* rb_enc_check */
4643
4644 /* substring longer than string */
4645 if (len < slen) return -1;
4646 if (len - pos < slen) pos = len - slen;
4647 if (len == 0) return pos;
4648
4649 sbeg = RSTRING_PTR(str);
4650
4651 if (pos == 0) {
4652 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4653 return 0;
4654 else
4655 return -1;
4656 }
4657
4658 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4659 return str_rindex(str, sub, s, enc);
4660}
4661
4662/*
4663 * call-seq:
4664 * rindex(substring, offset = self.length) -> integer or nil
4665 * rindex(regexp, offset = self.length) -> integer or nil
4666 *
4667 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4668 * or +nil+ if none found:
4669 *
4670 * 'foo'.rindex('f') # => 0
4671 * 'foo'.rindex('o') # => 2
4672 * 'foo'.rindex('oo') # => 1
4673 * 'foo'.rindex('ooo') # => nil
4674 *
4675 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4676 * or +nil+ if none found:
4677 *
4678 * 'foo'.rindex(/f/) # => 0
4679 * 'foo'.rindex(/o/) # => 2
4680 * 'foo'.rindex(/oo/) # => 1
4681 * 'foo'.rindex(/ooo/) # => nil
4682 *
4683 * The _last_ match means starting at the possible last position, not
4684 * the last of longest matches.
4685 *
4686 * 'foo'.rindex(/o+/) # => 2
4687 * $~ #=> #<MatchData "o">
4688 *
4689 * To get the last longest match, needs to combine with negative
4690 * lookbehind.
4691 *
4692 * 'foo'.rindex(/(?<!o)o+/) # => 1
4693 * $~ #=> #<MatchData "oo">
4694 *
 *  Or String#index with negative lookahead.
4696 *
4697 * 'foo'.index(/o+(?!.*o)/) # => 1
4698 * $~ #=> #<MatchData "oo">
4699 *
4700 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4701 * string to _end_ the search:
4702 *
4703 * 'foo'.rindex('o', 0) # => nil
4704 * 'foo'.rindex('o', 1) # => 1
4705 * 'foo'.rindex('o', 2) # => 2
4706 * 'foo'.rindex('o', 3) # => 2
4707 *
4708 * If +offset+ is a negative Integer, the maximum starting position in the
4709 * string to _end_ the search is the sum of the string's length and +offset+:
4710 *
4711 * 'foo'.rindex('o', -1) # => 2
4712 * 'foo'.rindex('o', -2) # => 1
4713 * 'foo'.rindex('o', -3) # => nil
4714 * 'foo'.rindex('o', -4) # => nil
4715 *
4716 * Related: String#index.
4717 */
4718
4719static VALUE
4720rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4721{
4722 VALUE sub;
4723 VALUE initpos;
4724 rb_encoding *enc = STR_ENC_GET(str);
4725 long pos, len = str_strlen(str, enc); /* str's enc */
4726
4727 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4728 pos = NUM2LONG(initpos);
4729 if (pos < 0 && (pos += len) < 0) {
4730 if (RB_TYPE_P(sub, T_REGEXP)) {
4732 }
4733 return Qnil;
4734 }
4735 if (pos > len) pos = len;
4736 }
4737 else {
4738 pos = len;
4739 }
4740
4741 if (RB_TYPE_P(sub, T_REGEXP)) {
4742 /* enc = rb_enc_check(str, sub); */
4743 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4744 enc, single_byte_optimizable(str));
4745
4746 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4747 VALUE match = rb_backref_get();
4748 struct re_registers *regs = RMATCH_REGS(match);
4749 pos = rb_str_sublen(str, BEG(0));
4750 return LONG2NUM(pos);
4751 }
4752 }
4753 else {
4754 StringValue(sub);
4755 pos = rb_str_rindex(str, sub, pos);
4756 if (pos >= 0) {
4757 pos = rb_str_sublen(str, pos);
4758 return LONG2NUM(pos);
4759 }
4760 }
4761 return Qnil;
4762}
4763
4764static long
4765rb_str_byterindex(VALUE str, VALUE sub, long pos)
4766{
4767 long len, slen;
4768 char *sbeg, *s;
4769 rb_encoding *enc;
4770
4771 enc = rb_enc_check(str, sub);
4772 if (is_broken_string(sub)) return -1;
4773 len = RSTRING_LEN(str);
4774 slen = RSTRING_LEN(sub);
4775
4776 /* substring longer than string */
4777 if (len < slen) return -1;
4778 if (len - pos < slen) pos = len - slen;
4779 if (len == 0) return pos;
4780
4781 sbeg = RSTRING_PTR(str);
4782
4783 if (pos == 0) {
4784 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4785 return 0;
4786 else
4787 return -1;
4788 }
4789
4790 s = sbeg + pos;
4791 return str_rindex(str, sub, s, enc);
4792}
4793
4794
4795/*
4796 * call-seq:
4797 * byterindex(substring, offset = self.bytesize) -> integer or nil
4798 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4799 *
4800 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4801 * or +nil+ if none found:
4802 *
4803 * 'foo'.byterindex('f') # => 0
4804 * 'foo'.byterindex('o') # => 2
4805 * 'foo'.byterindex('oo') # => 1
4806 * 'foo'.byterindex('ooo') # => nil
4807 *
4808 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4809 * or +nil+ if none found:
4810 *
4811 * 'foo'.byterindex(/f/) # => 0
4812 * 'foo'.byterindex(/o/) # => 2
4813 * 'foo'.byterindex(/oo/) # => 1
4814 * 'foo'.byterindex(/ooo/) # => nil
4815 *
4816 * The _last_ match means starting at the possible last position, not
4817 * the last of longest matches.
4818 *
4819 * 'foo'.byterindex(/o+/) # => 2
4820 * $~ #=> #<MatchData "o">
4821 *
4822 * To get the last longest match, needs to combine with negative
4823 * lookbehind.
4824 *
4825 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4826 * $~ #=> #<MatchData "oo">
4827 *
 *  Or String#byteindex with negative lookahead.
4829 *
4830 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4831 * $~ #=> #<MatchData "oo">
4832 *
4833 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4834 * string to _end_ the search:
4835 *
4836 * 'foo'.byterindex('o', 0) # => nil
4837 * 'foo'.byterindex('o', 1) # => 1
4838 * 'foo'.byterindex('o', 2) # => 2
4839 * 'foo'.byterindex('o', 3) # => 2
4840 *
4841 * If +offset+ is a negative Integer, the maximum starting position in the
4842 * string to _end_ the search is the sum of the string's length and +offset+:
4843 *
4844 * 'foo'.byterindex('o', -1) # => 2
4845 * 'foo'.byterindex('o', -2) # => 1
4846 * 'foo'.byterindex('o', -3) # => nil
4847 * 'foo'.byterindex('o', -4) # => nil
4848 *
 *  If +offset+ does not land on a character (codepoint) boundary, an
 *  +IndexError+ is raised.
4851 *
4852 * Related: String#byteindex.
4853 */
4854
4855static VALUE
4856rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4857{
4858 VALUE sub;
4859 VALUE initpos;
4860 long pos, len = RSTRING_LEN(str);
4861
4862 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4863 pos = NUM2LONG(initpos);
4864 if (pos < 0 && (pos += len) < 0) {
4865 if (RB_TYPE_P(sub, T_REGEXP)) {
4867 }
4868 return Qnil;
4869 }
4870 if (pos > len) pos = len;
4871 }
4872 else {
4873 pos = len;
4874 }
4875
4876 str_ensure_byte_pos(str, pos);
4877
4878 if (RB_TYPE_P(sub, T_REGEXP)) {
4879 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4880 VALUE match = rb_backref_get();
4881 struct re_registers *regs = RMATCH_REGS(match);
4882 pos = BEG(0);
4883 return LONG2NUM(pos);
4884 }
4885 }
4886 else {
4887 StringValue(sub);
4888 pos = rb_str_byterindex(str, sub, pos);
4889 if (pos >= 0) return LONG2NUM(pos);
4890 }
4891 return Qnil;
4892}
4893
4894/*
4895 * call-seq:
4896 * string =~ regexp -> integer or nil
4897 * string =~ object -> integer or nil
4898 *
4899 * Returns the Integer index of the first substring that matches
4900 * the given +regexp+, or +nil+ if no match found:
4901 *
4902 * 'foo' =~ /f/ # => 0
4903 * 'foo' =~ /o/ # => 1
4904 * 'foo' =~ /x/ # => nil
4905 *
4906 * Note: also updates Regexp@Global+Variables.
4907 *
4908 * If the given +object+ is not a Regexp, returns the value
4909 * returned by <tt>object =~ self</tt>.
4910 *
4911 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4912 * (see Regexp#=~):
4913 *
4914 * number= nil
4915 * "no. 9" =~ /(?<number>\d+)/
4916 * number # => nil (not assigned)
4917 * /(?<number>\d+)/ =~ "no. 9"
4918 * number #=> "9"
4919 *
4920 */
4921
4922static VALUE
4923rb_str_match(VALUE x, VALUE y)
4924{
4925 switch (OBJ_BUILTIN_TYPE(y)) {
4926 case T_STRING:
4927 rb_raise(rb_eTypeError, "type mismatch: String given");
4928
4929 case T_REGEXP:
4930 return rb_reg_match(y, x);
4931
4932 default:
4933 return rb_funcall(y, idEqTilde, 1, x);
4934 }
4935}
4936
4937
4938static VALUE get_pat(VALUE);
4939
4940
4941/*
4942 * call-seq:
4943 * match(pattern, offset = 0) -> matchdata or nil
4944 * match(pattern, offset = 0) {|matchdata| ... } -> object
4945 *
4946 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4947 *
4948 * Note: also updates Regexp@Global+Variables.
4949 *
4950 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4951 * regexp = Regexp.new(pattern)
4952 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4953 * (see Regexp#match):
 *      matchdata = regexp.match(self)
4955 *
4956 * With no block given, returns the computed +matchdata+:
4957 *
4958 * 'foo'.match('f') # => #<MatchData "f">
4959 * 'foo'.match('o') # => #<MatchData "o">
4960 * 'foo'.match('x') # => nil
4961 *
4962 * If Integer argument +offset+ is given, the search begins at index +offset+:
4963 *
4964 * 'foo'.match('f', 1) # => nil
4965 * 'foo'.match('o', 1) # => #<MatchData "o">
4966 *
4967 * With a block given, calls the block with the computed +matchdata+
4968 * and returns the block's return value:
4969 *
4970 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4971 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4972 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4973 *
4974 */
4975
4976static VALUE
4977rb_str_match_m(int argc, VALUE *argv, VALUE str)
4978{
4979 VALUE re, result;
4980 if (argc < 1)
4981 rb_check_arity(argc, 1, 2);
4982 re = argv[0];
4983 argv[0] = str;
4984 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4985 if (!NIL_P(result) && rb_block_given_p()) {
4986 return rb_yield(result);
4987 }
4988 return result;
4989}
4990
4991/*
4992 * call-seq:
4993 * match?(pattern, offset = 0) -> true or false
4994 *
4995 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4996 *
4997 * Note: does not update Regexp@Global+Variables.
4998 *
4999 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5000 * regexp = Regexp.new(pattern)
5001 *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
 *  +false+ otherwise:
5004 *
5005 * 'foo'.match?(/o/) # => true
5006 * 'foo'.match?('o') # => true
5007 * 'foo'.match?(/x/) # => false
5008 *
5009 * If Integer argument +offset+ is given, the search begins at index +offset+:
5010 * 'foo'.match?('f', 1) # => false
5011 * 'foo'.match?('o', 1) # => true
5012 *
5013 */
5014
5015static VALUE
5016rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5017{
5018 VALUE re;
5019 rb_check_arity(argc, 1, 2);
5020 re = get_pat(argv[0]);
5021 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5022}
5023
/* Result of stepping a single encoded character to its neighbor. */
enum neighbor_char {
    /* the bytes are not a character that can be stepped */
    NEIGHBOR_NOT_CHAR,
    /* the neighboring character was written in place */
    NEIGHBOR_FOUND,
    /* stepping ran off the end of the range; the caller must carry */
    NEIGHBOR_WRAPPED
};
5029
5030static enum neighbor_char
5031enc_succ_char(char *p, long len, rb_encoding *enc)
5032{
5033 long i;
5034 int l;
5035
5036 if (rb_enc_mbminlen(enc) > 1) {
5037 /* wchar, trivial case */
5038 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5039 if (!MBCLEN_CHARFOUND_P(r)) {
5040 return NEIGHBOR_NOT_CHAR;
5041 }
5042 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5043 l = rb_enc_code_to_mbclen(c, enc);
5044 if (!l) return NEIGHBOR_NOT_CHAR;
5045 if (l != len) return NEIGHBOR_WRAPPED;
5046 rb_enc_mbcput(c, p, enc);
5047 r = rb_enc_precise_mbclen(p, p + len, enc);
5048 if (!MBCLEN_CHARFOUND_P(r)) {
5049 return NEIGHBOR_NOT_CHAR;
5050 }
5051 return NEIGHBOR_FOUND;
5052 }
5053 while (1) {
5054 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5055 p[i] = '\0';
5056 if (i < 0)
5057 return NEIGHBOR_WRAPPED;
5058 ++((unsigned char*)p)[i];
5059 l = rb_enc_precise_mbclen(p, p+len, enc);
5060 if (MBCLEN_CHARFOUND_P(l)) {
5061 l = MBCLEN_CHARFOUND_LEN(l);
5062 if (l == len) {
5063 return NEIGHBOR_FOUND;
5064 }
5065 else {
5066 memset(p+l, 0xff, len-l);
5067 }
5068 }
5069 if (MBCLEN_INVALID_P(l) && i < len-1) {
5070 long len2;
5071 int l2;
5072 for (len2 = len-1; 0 < len2; len2--) {
5073 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5074 if (!MBCLEN_INVALID_P(l2))
5075 break;
5076 }
5077 memset(p+len2+1, 0xff, len-(len2+1));
5078 }
5079 }
5080}
5081
5082static enum neighbor_char
5083enc_pred_char(char *p, long len, rb_encoding *enc)
5084{
5085 long i;
5086 int l;
5087 if (rb_enc_mbminlen(enc) > 1) {
5088 /* wchar, trivial case */
5089 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5090 if (!MBCLEN_CHARFOUND_P(r)) {
5091 return NEIGHBOR_NOT_CHAR;
5092 }
5093 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5094 if (!c) return NEIGHBOR_NOT_CHAR;
5095 --c;
5096 l = rb_enc_code_to_mbclen(c, enc);
5097 if (!l) return NEIGHBOR_NOT_CHAR;
5098 if (l != len) return NEIGHBOR_WRAPPED;
5099 rb_enc_mbcput(c, p, enc);
5100 r = rb_enc_precise_mbclen(p, p + len, enc);
5101 if (!MBCLEN_CHARFOUND_P(r)) {
5102 return NEIGHBOR_NOT_CHAR;
5103 }
5104 return NEIGHBOR_FOUND;
5105 }
5106 while (1) {
5107 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5108 p[i] = '\xff';
5109 if (i < 0)
5110 return NEIGHBOR_WRAPPED;
5111 --((unsigned char*)p)[i];
5112 l = rb_enc_precise_mbclen(p, p+len, enc);
5113 if (MBCLEN_CHARFOUND_P(l)) {
5114 l = MBCLEN_CHARFOUND_LEN(l);
5115 if (l == len) {
5116 return NEIGHBOR_FOUND;
5117 }
5118 else {
5119 memset(p+l, 0, len-l);
5120 }
5121 }
5122 if (MBCLEN_INVALID_P(l) && i < len-1) {
5123 long len2;
5124 int l2;
5125 for (len2 = len-1; 0 < len2; len2--) {
5126 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5127 if (!MBCLEN_INVALID_P(l2))
5128 break;
5129 }
5130 memset(p+len2+1, 0, len-(len2+1));
5131 }
5132 }
5133}
5134
5135/*
5136 overwrite +p+ by succeeding letter in +enc+ and returns
5137 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5138 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5139 assuming each ranges are successive, and mbclen
5140 never change in each ranges.
5141 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5142 character.
5143 */
5144static enum neighbor_char
5145enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5146{
5147 enum neighbor_char ret;
5148 unsigned int c;
5149 int ctype;
5150 int range;
5151 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5152
5153 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5154 int try;
5155 const int max_gaps = 1;
5156
5157 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5158 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5159 ctype = ONIGENC_CTYPE_DIGIT;
5160 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5161 ctype = ONIGENC_CTYPE_ALPHA;
5162 else
5163 return NEIGHBOR_NOT_CHAR;
5164
5165 MEMCPY(save, p, char, len);
5166 for (try = 0; try <= max_gaps; ++try) {
5167 ret = enc_succ_char(p, len, enc);
5168 if (ret == NEIGHBOR_FOUND) {
5169 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5170 if (rb_enc_isctype(c, ctype, enc))
5171 return NEIGHBOR_FOUND;
5172 }
5173 }
5174 MEMCPY(p, save, char, len);
5175 range = 1;
5176 while (1) {
5177 MEMCPY(save, p, char, len);
5178 ret = enc_pred_char(p, len, enc);
5179 if (ret == NEIGHBOR_FOUND) {
5180 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5181 if (!rb_enc_isctype(c, ctype, enc)) {
5182 MEMCPY(p, save, char, len);
5183 break;
5184 }
5185 }
5186 else {
5187 MEMCPY(p, save, char, len);
5188 break;
5189 }
5190 range++;
5191 }
5192 if (range == 1) {
5193 return NEIGHBOR_NOT_CHAR;
5194 }
5195
5196 if (ctype != ONIGENC_CTYPE_DIGIT) {
5197 MEMCPY(carry, p, char, len);
5198 return NEIGHBOR_WRAPPED;
5199 }
5200
5201 MEMCPY(carry, p, char, len);
5202 enc_succ_char(carry, len, enc);
5203 return NEIGHBOR_WRAPPED;
5204}
5205
5206
5207static VALUE str_succ(VALUE str);
5208
5209/*
5210 * call-seq:
5211 * succ -> new_str
5212 *
5213 * Returns the successor to +self+. The successor is calculated by
5214 * incrementing characters.
5215 *
 *  The first character to be incremented is the rightmost alphanumeric
 *  or, if there are no alphanumerics, the rightmost character:
5218 *
5219 * 'THX1138'.succ # => "THX1139"
5220 * '<<koala>>'.succ # => "<<koalb>>"
5221 * '***'.succ # => '**+'
5222 *
5223 * The successor to a digit is another digit, "carrying" to the next-left
5224 * character for a "rollover" from 9 to 0, and prepending another digit
5225 * if necessary:
5226 *
5227 * '00'.succ # => "01"
5228 * '09'.succ # => "10"
5229 * '99'.succ # => "100"
5230 *
5231 * The successor to a letter is another letter of the same case,
5232 * carrying to the next-left character for a rollover,
5233 * and prepending another same-case letter if necessary:
5234 *
5235 * 'aa'.succ # => "ab"
5236 * 'az'.succ # => "ba"
5237 * 'zz'.succ # => "aaa"
5238 * 'AA'.succ # => "AB"
5239 * 'AZ'.succ # => "BA"
5240 * 'ZZ'.succ # => "AAA"
5241 *
5242 * The successor to a non-alphanumeric character is the next character
5243 * in the underlying character set's collating sequence,
5244 * carrying to the next-left character for a rollover,
5245 * and prepending another character if necessary:
5246 *
5247 * s = 0.chr * 3
5248 * s # => "\x00\x00\x00"
5249 * s.succ # => "\x00\x00\x01"
5250 * s = 255.chr * 3
5251 * s # => "\xFF\xFF\xFF"
5252 * s.succ # => "\x01\x00\x00\x00"
5253 *
5254 * Carrying can occur between and among mixtures of alphanumeric characters:
5255 *
5256 * s = 'zz99zz99'
5257 * s.succ # => "aaa00aa00"
5258 * s = '99zz99zz'
5259 * s.succ # => "100aa00aa"
5260 *
5261 * The successor to an empty +String+ is a new empty +String+:
5262 *
5263 * ''.succ # => ""
5264 *
5265 */
5266
5267VALUE
5269{
5270 VALUE str;
5271 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5272 rb_enc_cr_str_copy_for_substr(str, orig);
5273 return str_succ(str);
5274}
5275
5276static VALUE
5277str_succ(VALUE str)
5278{
5279 rb_encoding *enc;
5280 char *sbeg, *s, *e, *last_alnum = 0;
5281 int found_alnum = 0;
5282 long l, slen;
5283 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5284 long carry_pos = 0, carry_len = 1;
5285 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5286
5287 slen = RSTRING_LEN(str);
5288 if (slen == 0) return str;
5289
5290 enc = STR_ENC_GET(str);
5291 sbeg = RSTRING_PTR(str);
5292 s = e = sbeg + slen;
5293
5294 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5295 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5296 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5297 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5298 break;
5299 }
5300 }
5301 l = rb_enc_precise_mbclen(s, e, enc);
5302 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5303 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5304 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5305 switch (neighbor) {
5306 case NEIGHBOR_NOT_CHAR:
5307 continue;
5308 case NEIGHBOR_FOUND:
5309 return str;
5310 case NEIGHBOR_WRAPPED:
5311 last_alnum = s;
5312 break;
5313 }
5314 found_alnum = 1;
5315 carry_pos = s - sbeg;
5316 carry_len = l;
5317 }
5318 if (!found_alnum) { /* str contains no alnum */
5319 s = e;
5320 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5321 enum neighbor_char neighbor;
5322 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5323 l = rb_enc_precise_mbclen(s, e, enc);
5324 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5325 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5326 MEMCPY(tmp, s, char, l);
5327 neighbor = enc_succ_char(tmp, l, enc);
5328 switch (neighbor) {
5329 case NEIGHBOR_FOUND:
5330 MEMCPY(s, tmp, char, l);
5331 return str;
5332 break;
5333 case NEIGHBOR_WRAPPED:
5334 MEMCPY(s, tmp, char, l);
5335 break;
5336 case NEIGHBOR_NOT_CHAR:
5337 break;
5338 }
5339 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5340 /* wrapped to \0...\0. search next valid char. */
5341 enc_succ_char(s, l, enc);
5342 }
5343 if (!rb_enc_asciicompat(enc)) {
5344 MEMCPY(carry, s, char, l);
5345 carry_len = l;
5346 }
5347 carry_pos = s - sbeg;
5348 }
5350 }
5351 RESIZE_CAPA(str, slen + carry_len);
5352 sbeg = RSTRING_PTR(str);
5353 s = sbeg + carry_pos;
5354 memmove(s + carry_len, s, slen - carry_pos);
5355 memmove(s, carry, carry_len);
5356 slen += carry_len;
5357 STR_SET_LEN(str, slen);
5358 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5360 return str;
5361}
5362
5363
5364/*
5365 * call-seq:
5366 * succ! -> self
5367 *
5368 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5369 */
5370
5371static VALUE
5372rb_str_succ_bang(VALUE str)
5373{
5374 rb_str_modify(str);
5375 str_succ(str);
5376 return str;
5377}
5378
/*
 * Returns 1 when each of the first +len+ bytes of +s+ satisfies ISDIGIT
 * (vacuously so when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5388
5389static int
5390str_upto_i(VALUE str, VALUE arg)
5391{
5392 rb_yield(str);
5393 return 0;
5394}
5395
5396/*
5397 * call-seq:
5398 * upto(other_string, exclusive = false) {|string| ... } -> self
5399 * upto(other_string, exclusive = false) -> new_enumerator
5400 *
5401 * With a block given, calls the block with each +String+ value
5402 * returned by successive calls to String#succ;
5403 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5404 * the sequence terminates when value +other_string+ is reached;
5405 * returns +self+:
5406 *
5407 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5408 * Output:
5409 *
5410 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5411 *
5412 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5413 *
5414 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5415 *
5416 * Output:
5417 *
5418 * a8 a9 b0 b1 b2 b3 b4 b5
5419 *
5420 * If +other_string+ would not be reached, does not call the block:
5421 *
5422 * '25'.upto('5') {|s| fail s }
5423 * 'aa'.upto('a') {|s| fail s }
5424 *
5425 * With no block given, returns a new Enumerator:
5426 *
5427 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5428 *
5429 */
5430
5431static VALUE
5432rb_str_upto(int argc, VALUE *argv, VALUE beg)
5433{
5434 VALUE end, exclusive;
5435
5436 rb_scan_args(argc, argv, "11", &end, &exclusive);
5437 RETURN_ENUMERATOR(beg, argc, argv);
5438 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5439}
5440
5441VALUE
5442rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5443{
5444 VALUE current, after_end;
5445 ID succ;
5446 int n, ascii;
5447 rb_encoding *enc;
5448
5449 CONST_ID(succ, "succ");
5450 StringValue(end);
5451 enc = rb_enc_check(beg, end);
5452 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5453 /* single character */
5454 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5455 char c = RSTRING_PTR(beg)[0];
5456 char e = RSTRING_PTR(end)[0];
5457
5458 if (c > e || (excl && c == e)) return beg;
5459 for (;;) {
5460 VALUE str = rb_enc_str_new(&c, 1, enc);
5462 if ((*each)(str, arg)) break;
5463 if (!excl && c == e) break;
5464 c++;
5465 if (excl && c == e) break;
5466 }
5467 return beg;
5468 }
5469 /* both edges are all digits */
5470 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5471 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5472 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5473 VALUE b, e;
5474 int width;
5475
5476 width = RSTRING_LENINT(beg);
5477 b = rb_str_to_inum(beg, 10, FALSE);
5478 e = rb_str_to_inum(end, 10, FALSE);
5479 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5480 long bi = FIX2LONG(b);
5481 long ei = FIX2LONG(e);
5482 rb_encoding *usascii = rb_usascii_encoding();
5483
5484 while (bi <= ei) {
5485 if (excl && bi == ei) break;
5486 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5487 bi++;
5488 }
5489 }
5490 else {
5491 ID op = excl ? '<' : idLE;
5492 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5493
5494 args[0] = INT2FIX(width);
5495 while (rb_funcall(b, op, 1, e)) {
5496 args[1] = b;
5497 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5498 b = rb_funcallv(b, succ, 0, 0);
5499 }
5500 }
5501 return beg;
5502 }
5503 /* normal case */
5504 n = rb_str_cmp(beg, end);
5505 if (n > 0 || (excl && n == 0)) return beg;
5506
5507 after_end = rb_funcallv(end, succ, 0, 0);
5508 current = str_duplicate(rb_cString, beg);
5509 while (!rb_str_equal(current, after_end)) {
5510 VALUE next = Qnil;
5511 if (excl || !rb_str_equal(current, end))
5512 next = rb_funcallv(current, succ, 0, 0);
5513 if ((*each)(current, arg)) break;
5514 if (NIL_P(next)) break;
5515 current = next;
5516 StringValue(current);
5517 if (excl && rb_str_equal(current, end)) break;
5518 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5519 break;
5520 }
5521
5522 return beg;
5523}
5524
5525VALUE
5526rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5527{
5528 VALUE current;
5529 ID succ;
5530
5531 CONST_ID(succ, "succ");
5532 /* both edges are all digits */
5533 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5534 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5535 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5536 int width = RSTRING_LENINT(beg);
5537 b = rb_str_to_inum(beg, 10, FALSE);
5538 if (FIXNUM_P(b)) {
5539 long bi = FIX2LONG(b);
5540 rb_encoding *usascii = rb_usascii_encoding();
5541
5542 while (FIXABLE(bi)) {
5543 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5544 bi++;
5545 }
5546 b = LONG2NUM(bi);
5547 }
5548 args[0] = INT2FIX(width);
5549 while (1) {
5550 args[1] = b;
5551 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5552 b = rb_funcallv(b, succ, 0, 0);
5553 }
5554 }
5555 /* normal case */
5556 current = str_duplicate(rb_cString, beg);
5557 while (1) {
5558 VALUE next = rb_funcallv(current, succ, 0, 0);
5559 if ((*each)(current, arg)) break;
5560 current = next;
5561 StringValue(current);
5562 if (RSTRING_LEN(current) == 0)
5563 break;
5564 }
5565
5566 return beg;
5567}
5568
5569static int
5570include_range_i(VALUE str, VALUE arg)
5571{
5572 VALUE *argp = (VALUE *)arg;
5573 if (!rb_equal(str, *argp)) return 0;
5574 *argp = Qnil;
5575 return 1;
5576}
5577
5578VALUE
5579rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5580{
5581 beg = rb_str_new_frozen(beg);
5582 StringValue(end);
5583 end = rb_str_new_frozen(end);
5584 if (NIL_P(val)) return Qfalse;
5585 val = rb_check_string_type(val);
5586 if (NIL_P(val)) return Qfalse;
5587 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5588 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5589 rb_enc_asciicompat(STR_ENC_GET(val))) {
5590 const char *bp = RSTRING_PTR(beg);
5591 const char *ep = RSTRING_PTR(end);
5592 const char *vp = RSTRING_PTR(val);
5593 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5594 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5595 return Qfalse;
5596 else {
5597 char b = *bp;
5598 char e = *ep;
5599 char v = *vp;
5600
5601 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5602 if (b <= v && v < e) return Qtrue;
5603 return RBOOL(!RTEST(exclusive) && v == e);
5604 }
5605 }
5606 }
5607#if 0
5608 /* both edges are all digits */
5609 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5610 all_digits_p(bp, RSTRING_LEN(beg)) &&
5611 all_digits_p(ep, RSTRING_LEN(end))) {
5612 /* TODO */
5613 }
5614#endif
5615 }
5616 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5617
5618 return RBOOL(NIL_P(val));
5619}
5620
5621static VALUE
5622rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5623{
5624 if (rb_reg_search(re, str, 0, 0) >= 0) {
5625 VALUE match = rb_backref_get();
5626 int nth = rb_reg_backref_number(match, backref);
5627 return rb_reg_nth_match(nth, match);
5628 }
5629 return Qnil;
5630}
5631
5632static VALUE
5633rb_str_aref(VALUE str, VALUE indx)
5634{
5635 long idx;
5636
5637 if (FIXNUM_P(indx)) {
5638 idx = FIX2LONG(indx);
5639 }
5640 else if (RB_TYPE_P(indx, T_REGEXP)) {
5641 return rb_str_subpat(str, indx, INT2FIX(0));
5642 }
5643 else if (RB_TYPE_P(indx, T_STRING)) {
5644 if (rb_str_index(str, indx, 0) != -1)
5645 return str_duplicate(rb_cString, indx);
5646 return Qnil;
5647 }
5648 else {
5649 /* check if indx is Range */
5650 long beg, len = str_strlen(str, NULL);
5651 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5652 case Qfalse:
5653 break;
5654 case Qnil:
5655 return Qnil;
5656 default:
5657 return rb_str_substr(str, beg, len);
5658 }
5659 idx = NUM2LONG(indx);
5660 }
5661
5662 return str_substr(str, idx, 1, FALSE);
5663}
5664
5665
5666/*
5667 * call-seq:
5668 * string[index] -> new_string or nil
5669 * string[start, length] -> new_string or nil
5670 * string[range] -> new_string or nil
5671 * string[regexp, capture = 0] -> new_string or nil
5672 * string[substring] -> new_string or nil
5673 *
5674 * Returns the substring of +self+ specified by the arguments.
5675 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5676 *
5677 *
5678 */
5679
5680static VALUE
5681rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5682{
5683 if (argc == 2) {
5684 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5685 return rb_str_subpat(str, argv[0], argv[1]);
5686 }
5687 else {
5688 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5689 }
5690 }
5691 rb_check_arity(argc, 1, 2);
5692 return rb_str_aref(str, argv[0]);
5693}
5694
5695VALUE
5697{
5698 char *ptr = RSTRING_PTR(str);
5699 long olen = RSTRING_LEN(str), nlen;
5700
5701 str_modifiable(str);
5702 if (len > olen) len = olen;
5703 nlen = olen - len;
5704 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5705 char *oldptr = ptr;
5706 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5707 STR_SET_EMBED(str);
5708 ptr = RSTRING(str)->as.embed.ary;
5709 memmove(ptr, oldptr + len, nlen);
5710 if (fl == STR_NOEMBED) xfree(oldptr);
5711 }
5712 else {
5713 if (!STR_SHARED_P(str)) {
5714 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5715 rb_enc_cr_str_exact_copy(shared, str);
5716 OBJ_FREEZE(shared);
5717 }
5718 ptr = RSTRING(str)->as.heap.ptr += len;
5719 }
5720 STR_SET_LEN(str, nlen);
5721
5722 if (!SHARABLE_MIDDLE_SUBSTRING) {
5723 TERM_FILL(ptr + nlen, TERM_LEN(str));
5724 }
5726 return str;
5727}
5728
5729static void
5730rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5731{
5732 char *sptr;
5733 long slen;
5734 int cr;
5735
5736 if (beg == 0 && vlen == 0) {
5737 rb_str_drop_bytes(str, len);
5738 return;
5739 }
5740
5741 str_modify_keep_cr(str);
5742 RSTRING_GETMEM(str, sptr, slen);
5743 if (len < vlen) {
5744 /* expand string */
5745 RESIZE_CAPA(str, slen + vlen - len);
5746 sptr = RSTRING_PTR(str);
5747 }
5748
5750 cr = rb_enc_str_coderange(val);
5751 else
5753
5754 if (vlen != len) {
5755 memmove(sptr + beg + vlen,
5756 sptr + beg + len,
5757 slen - (beg + len));
5758 }
5759 if (vlen < beg && len < 0) {
5760 MEMZERO(sptr + slen, char, -len);
5761 }
5762 if (vlen > 0) {
5763 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5764 }
5765 slen += vlen - len;
5766 STR_SET_LEN(str, slen);
5767 TERM_FILL(&sptr[slen], TERM_LEN(str));
5768 ENC_CODERANGE_SET(str, cr);
5769}
5770
5771static inline void
5772rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5773{
5774 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5775}
5776
5777void
5778rb_str_update(VALUE str, long beg, long len, VALUE val)
5779{
5780 long slen;
5781 char *p, *e;
5782 rb_encoding *enc;
5783 int singlebyte = single_byte_optimizable(str);
5784 int cr;
5785
5786 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5787
5788 StringValue(val);
5789 enc = rb_enc_check(str, val);
5790 slen = str_strlen(str, enc); /* rb_enc_check */
5791
5792 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5793 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5794 }
5795 if (beg < 0) {
5796 beg += slen;
5797 }
5798 RUBY_ASSERT(beg >= 0);
5799 RUBY_ASSERT(beg <= slen);
5800
5801 if (len > slen - beg) {
5802 len = slen - beg;
5803 }
5804 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5805 if (!p) p = RSTRING_END(str);
5806 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5807 if (!e) e = RSTRING_END(str);
5808 /* error check */
5809 beg = p - RSTRING_PTR(str); /* physical position */
5810 len = e - p; /* physical length */
5811 rb_str_update_0(str, beg, len, val);
5812 rb_enc_associate(str, enc);
5814 if (cr != ENC_CODERANGE_BROKEN)
5815 ENC_CODERANGE_SET(str, cr);
5816}
5817
5818static void
5819rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5820{
5821 int nth;
5822 VALUE match;
5823 long start, end, len;
5824 rb_encoding *enc;
5825 struct re_registers *regs;
5826
5827 if (rb_reg_search(re, str, 0, 0) < 0) {
5828 rb_raise(rb_eIndexError, "regexp not matched");
5829 }
5830 match = rb_backref_get();
5831 nth = rb_reg_backref_number(match, backref);
5832 regs = RMATCH_REGS(match);
5833 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5834 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5835 }
5836 if (nth < 0) {
5837 nth += regs->num_regs;
5838 }
5839
5840 start = BEG(nth);
5841 if (start == -1) {
5842 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5843 }
5844 end = END(nth);
5845 len = end - start;
5846 StringValue(val);
5847 enc = rb_enc_check_str(str, val);
5848 rb_str_update_0(str, start, len, val);
5849 rb_enc_associate(str, enc);
5850}
5851
5852static VALUE
5853rb_str_aset(VALUE str, VALUE indx, VALUE val)
5854{
5855 long idx, beg;
5856
5857 switch (TYPE(indx)) {
5858 case T_REGEXP:
5859 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5860 return val;
5861
5862 case T_STRING:
5863 beg = rb_str_index(str, indx, 0);
5864 if (beg < 0) {
5865 rb_raise(rb_eIndexError, "string not matched");
5866 }
5867 beg = rb_str_sublen(str, beg);
5868 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5869 return val;
5870
5871 default:
5872 /* check if indx is Range */
5873 {
5874 long beg, len;
5875 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5876 rb_str_update(str, beg, len, val);
5877 return val;
5878 }
5879 }
5880 /* FALLTHROUGH */
5881
5882 case T_FIXNUM:
5883 idx = NUM2LONG(indx);
5884 rb_str_update(str, idx, 1, val);
5885 return val;
5886 }
5887}
5888
5889/*
5890 * call-seq:
5891 * string[index] = new_string
5892 * string[start, length] = new_string
5893 * string[range] = new_string
5894 * string[regexp, capture = 0] = new_string
5895 * string[substring] = new_string
5896 *
5897 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5898 * See {String Slices}[rdoc-ref:String@String+Slices].
5899 *
5900 * A few examples:
5901 *
5902 * s = 'foo'
5903 * s[2] = 'rtune' # => "rtune"
5904 * s # => "fortune"
5905 * s[1, 5] = 'init' # => "init"
5906 * s # => "finite"
5907 * s[3..4] = 'al' # => "al"
5908 * s # => "finale"
5909 * s[/e$/] = 'ly' # => "ly"
5910 * s # => "finally"
5911 * s['lly'] = 'ncial' # => "ncial"
5912 * s # => "financial"
5913 *
5914 */
5915
5916static VALUE
5917rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5918{
5919 if (argc == 3) {
5920 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5921 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5922 }
5923 else {
5924 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5925 }
5926 return argv[2];
5927 }
5928 rb_check_arity(argc, 2, 3);
5929 return rb_str_aset(str, argv[0], argv[1]);
5930}
5931
5932/*
5933 * call-seq:
5934 * insert(index, other_string) -> self
5935 *
5936 * Inserts the given +other_string+ into +self+; returns +self+.
5937 *
5938 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5939 *
5940 * 'foo'.insert(1, 'bar') # => "fbaroo"
5941 *
5942 * If the Integer +index+ is negative, counts backward from the end of +self+
5943 * and inserts +other_string+ at offset <tt>index+1</tt>
5944 * (that is, _after_ <tt>self[index]</tt>):
5945 *
5946 * 'foo'.insert(-2, 'bar') # => "fobaro"
5947 *
5948 */
5949
5950static VALUE
5951rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5952{
5953 long pos = NUM2LONG(idx);
5954
5955 if (pos == -1) {
5956 return rb_str_append(str, str2);
5957 }
5958 else if (pos < 0) {
5959 pos++;
5960 }
5961 rb_str_update(str, pos, 0, str2);
5962 return str;
5963}
5964
5965
5966/*
5967 * call-seq:
5968 * slice!(index) -> new_string or nil
5969 * slice!(start, length) -> new_string or nil
5970 * slice!(range) -> new_string or nil
5971 * slice!(regexp, capture = 0) -> new_string or nil
5972 * slice!(substring) -> new_string or nil
5973 *
5974 * Removes and returns the substring of +self+ specified by the arguments.
5975 * See {String Slices}[rdoc-ref:String@String+Slices].
5976 *
5977 * A few examples:
5978 *
5979 * string = "This is a string"
5980 * string.slice!(2) #=> "i"
5981 * string.slice!(3..6) #=> " is "
5982 * string.slice!(/s.*t/) #=> "sa st"
5983 * string.slice!("r") #=> "r"
5984 * string #=> "Thing"
5985 *
5986 */
5987
5988static VALUE
5989rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5990{
5991 VALUE result = Qnil;
5992 VALUE indx;
5993 long beg, len = 1;
5994 char *p;
5995
5996 rb_check_arity(argc, 1, 2);
5997 str_modify_keep_cr(str);
5998 indx = argv[0];
5999 if (RB_TYPE_P(indx, T_REGEXP)) {
6000 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6001 VALUE match = rb_backref_get();
6002 struct re_registers *regs = RMATCH_REGS(match);
6003 int nth = 0;
6004 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6005 if ((nth += regs->num_regs) <= 0) return Qnil;
6006 }
6007 else if (nth >= regs->num_regs) return Qnil;
6008 beg = BEG(nth);
6009 len = END(nth) - beg;
6010 goto subseq;
6011 }
6012 else if (argc == 2) {
6013 beg = NUM2LONG(indx);
6014 len = NUM2LONG(argv[1]);
6015 goto num_index;
6016 }
6017 else if (FIXNUM_P(indx)) {
6018 beg = FIX2LONG(indx);
6019 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6020 if (!len) return Qnil;
6021 beg = p - RSTRING_PTR(str);
6022 goto subseq;
6023 }
6024 else if (RB_TYPE_P(indx, T_STRING)) {
6025 beg = rb_str_index(str, indx, 0);
6026 if (beg == -1) return Qnil;
6027 len = RSTRING_LEN(indx);
6028 result = str_duplicate(rb_cString, indx);
6029 goto squash;
6030 }
6031 else {
6032 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6033 case Qnil:
6034 return Qnil;
6035 case Qfalse:
6036 beg = NUM2LONG(indx);
6037 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6038 if (!len) return Qnil;
6039 beg = p - RSTRING_PTR(str);
6040 goto subseq;
6041 default:
6042 goto num_index;
6043 }
6044 }
6045
6046 num_index:
6047 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6048 beg = p - RSTRING_PTR(str);
6049
6050 subseq:
6051 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6052 rb_enc_cr_str_copy_for_substr(result, str);
6053
6054 squash:
6055 if (len > 0) {
6056 if (beg == 0) {
6057 rb_str_drop_bytes(str, len);
6058 }
6059 else {
6060 char *sptr = RSTRING_PTR(str);
6061 long slen = RSTRING_LEN(str);
6062 if (beg + len > slen) /* pathological check */
6063 len = slen - beg;
6064 memmove(sptr + beg,
6065 sptr + beg + len,
6066 slen - (beg + len));
6067 slen -= len;
6068 STR_SET_LEN(str, slen);
6069 TERM_FILL(&sptr[slen], TERM_LEN(str));
6070 }
6071 }
6072 return result;
6073}
6074
6075static VALUE
6076get_pat(VALUE pat)
6077{
6078 VALUE val;
6079
6080 switch (OBJ_BUILTIN_TYPE(pat)) {
6081 case T_REGEXP:
6082 return pat;
6083
6084 case T_STRING:
6085 break;
6086
6087 default:
6088 val = rb_check_string_type(pat);
6089 if (NIL_P(val)) {
6090 Check_Type(pat, T_REGEXP);
6091 }
6092 pat = val;
6093 }
6094
6095 return rb_reg_regcomp(pat);
6096}
6097
6098static VALUE
6099get_pat_quoted(VALUE pat, int check)
6100{
6101 VALUE val;
6102
6103 switch (OBJ_BUILTIN_TYPE(pat)) {
6104 case T_REGEXP:
6105 return pat;
6106
6107 case T_STRING:
6108 break;
6109
6110 default:
6111 val = rb_check_string_type(pat);
6112 if (NIL_P(val)) {
6113 Check_Type(pat, T_REGEXP);
6114 }
6115 pat = val;
6116 }
6117 if (check && is_broken_string(pat)) {
6118 rb_exc_raise(rb_reg_check_preprocess(pat));
6119 }
6120 return pat;
6121}
6122
6123static long
6124rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6125{
6126 if (BUILTIN_TYPE(pat) == T_STRING) {
6127 pos = rb_str_byteindex(str, pat, pos);
6128 if (set_backref_str) {
6129 if (pos >= 0) {
6130 str = rb_str_new_frozen_String(str);
6131 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6132 }
6133 else {
6135 }
6136 }
6137 return pos;
6138 }
6139 else {
6140 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6141 }
6142}
6143
6144
6145/*
6146 * call-seq:
6147 * sub!(pattern, replacement) -> self or nil
6148 * sub!(pattern) {|match| ... } -> self or nil
6149 *
6150 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6151 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6152 *
6153 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6154 *
6155 * Related: String#sub, String#gsub, String#gsub!.
6156 *
6157 */
6158
6159static VALUE
6160rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6161{
6162 VALUE pat, repl, hash = Qnil;
6163 int iter = 0;
6164 long plen;
6165 int min_arity = rb_block_given_p() ? 1 : 2;
6166 long beg;
6167
6168 rb_check_arity(argc, min_arity, 2);
6169 if (argc == 1) {
6170 iter = 1;
6171 }
6172 else {
6173 repl = argv[1];
6174 hash = rb_check_hash_type(argv[1]);
6175 if (NIL_P(hash)) {
6176 StringValue(repl);
6177 }
6178 }
6179
6180 pat = get_pat_quoted(argv[0], 1);
6181
6182 str_modifiable(str);
6183 beg = rb_pat_search(pat, str, 0, 1);
6184 if (beg >= 0) {
6185 rb_encoding *enc;
6186 int cr = ENC_CODERANGE(str);
6187 long beg0, end0;
6188 VALUE match, match0 = Qnil;
6189 struct re_registers *regs;
6190 char *p, *rp;
6191 long len, rlen;
6192
6193 match = rb_backref_get();
6194 regs = RMATCH_REGS(match);
6195 if (RB_TYPE_P(pat, T_STRING)) {
6196 beg0 = beg;
6197 end0 = beg0 + RSTRING_LEN(pat);
6198 match0 = pat;
6199 }
6200 else {
6201 beg0 = BEG(0);
6202 end0 = END(0);
6203 if (iter) match0 = rb_reg_nth_match(0, match);
6204 }
6205
6206 if (iter || !NIL_P(hash)) {
6207 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6208
6209 if (iter) {
6210 repl = rb_obj_as_string(rb_yield(match0));
6211 }
6212 else {
6213 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6214 repl = rb_obj_as_string(repl);
6215 }
6216 str_mod_check(str, p, len);
6217 rb_check_frozen(str);
6218 }
6219 else {
6220 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6221 }
6222
6223 enc = rb_enc_compatible(str, repl);
6224 if (!enc) {
6225 rb_encoding *str_enc = STR_ENC_GET(str);
6226 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6227 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6228 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6229 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6230 rb_enc_inspect_name(str_enc),
6231 rb_enc_inspect_name(STR_ENC_GET(repl)));
6232 }
6233 enc = STR_ENC_GET(repl);
6234 }
6235 rb_str_modify(str);
6236 rb_enc_associate(str, enc);
6238 int cr2 = ENC_CODERANGE(repl);
6239 if (cr2 == ENC_CODERANGE_BROKEN ||
6240 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6242 else
6243 cr = cr2;
6244 }
6245 plen = end0 - beg0;
6246 rlen = RSTRING_LEN(repl);
6247 len = RSTRING_LEN(str);
6248 if (rlen > plen) {
6249 RESIZE_CAPA(str, len + rlen - plen);
6250 }
6251 p = RSTRING_PTR(str);
6252 if (rlen != plen) {
6253 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6254 }
6255 rp = RSTRING_PTR(repl);
6256 memmove(p + beg0, rp, rlen);
6257 len += rlen - plen;
6258 STR_SET_LEN(str, len);
6259 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6260 ENC_CODERANGE_SET(str, cr);
6261
6262 RB_GC_GUARD(match);
6263
6264 return str;
6265 }
6266 return Qnil;
6267}
6268
6269
6270/*
6271 * call-seq:
6272 * sub(pattern, replacement) -> new_string
6273 * sub(pattern) {|match| ... } -> new_string
6274 *
6275 * Returns a copy of +self+ with only the first occurrence
6276 * (not all occurrences) of the given +pattern+ replaced.
6277 *
6278 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6279 *
6280 * Related: String#sub!, String#gsub, String#gsub!.
6281 *
6282 */
6283
6284static VALUE
6285rb_str_sub(int argc, VALUE *argv, VALUE str)
6286{
6287 str = str_duplicate(rb_cString, str);
6288 rb_str_sub_bang(argc, argv, str);
6289 return str;
6290}
6291
6292static VALUE
6293str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6294{
6295 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6296 long beg, beg0, end0;
6297 long offset, blen, slen, len, last;
6298 enum {STR, ITER, MAP} mode = STR;
6299 char *sp, *cp;
6300 int need_backref = -1;
6301 rb_encoding *str_enc;
6302
6303 switch (argc) {
6304 case 1:
6305 RETURN_ENUMERATOR(str, argc, argv);
6306 mode = ITER;
6307 break;
6308 case 2:
6309 repl = argv[1];
6310 hash = rb_check_hash_type(argv[1]);
6311 if (NIL_P(hash)) {
6312 StringValue(repl);
6313 }
6314 else {
6315 mode = MAP;
6316 }
6317 break;
6318 default:
6319 rb_error_arity(argc, 1, 2);
6320 }
6321
6322 pat = get_pat_quoted(argv[0], 1);
6323 beg = rb_pat_search(pat, str, 0, need_backref);
6324 if (beg < 0) {
6325 if (bang) return Qnil; /* no match, no substitution */
6326 return str_duplicate(rb_cString, str);
6327 }
6328
6329 offset = 0;
6330 blen = RSTRING_LEN(str) + 30; /* len + margin */
6331 dest = rb_str_buf_new(blen);
6332 sp = RSTRING_PTR(str);
6333 slen = RSTRING_LEN(str);
6334 cp = sp;
6335 str_enc = STR_ENC_GET(str);
6336 rb_enc_associate(dest, str_enc);
6337 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6338
6339 do {
6340 VALUE match = rb_backref_get();
6341 struct re_registers *regs = RMATCH_REGS(match);
6342 if (RB_TYPE_P(pat, T_STRING)) {
6343 beg0 = beg;
6344 end0 = beg0 + RSTRING_LEN(pat);
6345 match0 = pat;
6346 }
6347 else {
6348 beg0 = BEG(0);
6349 end0 = END(0);
6350 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6351 }
6352
6353 if (mode) {
6354 if (mode == ITER) {
6355 val = rb_obj_as_string(rb_yield(match0));
6356 }
6357 else {
6358 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6359 val = rb_obj_as_string(val);
6360 }
6361 str_mod_check(str, sp, slen);
6362 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6363 rb_raise(rb_eRuntimeError, "block should not cheat");
6364 }
6365 }
6366 else if (need_backref) {
6367 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6368 if (need_backref < 0) {
6369 need_backref = val != repl;
6370 }
6371 }
6372 else {
6373 val = repl;
6374 }
6375
6376 len = beg0 - offset; /* copy pre-match substr */
6377 if (len) {
6378 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6379 }
6380
6381 rb_str_buf_append(dest, val);
6382
6383 last = offset;
6384 offset = end0;
6385 if (beg0 == end0) {
6386 /*
6387 * Always consume at least one character of the input string
6388 * in order to prevent infinite loops.
6389 */
6390 if (RSTRING_LEN(str) <= end0) break;
6391 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6392 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6393 offset = end0 + len;
6394 }
6395 cp = RSTRING_PTR(str) + offset;
6396 if (offset > RSTRING_LEN(str)) break;
6397 beg = rb_pat_search(pat, str, offset, need_backref);
6398
6399 RB_GC_GUARD(match);
6400 } while (beg >= 0);
6401 if (RSTRING_LEN(str) > offset) {
6402 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6403 }
6404 rb_pat_search(pat, str, last, 1);
6405 if (bang) {
6406 str_shared_replace(str, dest);
6407 }
6408 else {
6409 str = dest;
6410 }
6411
6412 return str;
6413}
6414
6415
6416/*
6417 * call-seq:
6418 * gsub!(pattern, replacement) -> self or nil
6419 * gsub!(pattern) {|match| ... } -> self or nil
6420 * gsub!(pattern) -> an_enumerator
6421 *
6422 * Performs the specified substring replacement(s) on +self+;
6423 * returns +self+ if any replacement occurred, +nil+ otherwise.
6424 *
6425 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6426 *
6427 * Returns an Enumerator if no +replacement+ and no block given.
6428 *
6429 * Related: String#sub, String#gsub, String#sub!.
6430 *
6431 */
6432
6433static VALUE
6434rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6435{
6436 str_modify_keep_cr(str);
6437 return str_gsub(argc, argv, str, 1);
6438}
6439
6440
6441/*
6442 * call-seq:
6443 * gsub(pattern, replacement) -> new_string
6444 * gsub(pattern) {|match| ... } -> new_string
6445 * gsub(pattern) -> enumerator
6446 *
6447 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6448 *
6449 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6450 *
6451 * Returns an Enumerator if no +replacement+ and no block given.
6452 *
6453 * Related: String#sub, String#sub!, String#gsub!.
6454 *
6455 */
6456
6457static VALUE
6458rb_str_gsub(int argc, VALUE *argv, VALUE str)
6459{
6460 return str_gsub(argc, argv, str, 0);
6461}
6462
6463
6464/*
6465 * call-seq:
6466 * replace(other_string) -> self
6467 *
6468 * Replaces the contents of +self+ with the contents of +other_string+:
6469 *
6470 * s = 'foo' # => "foo"
6471 * s.replace('bar') # => "bar"
6472 *
6473 */
6474
6475VALUE
6477{
6478 str_modifiable(str);
6479 if (str == str2) return str;
6480
6481 StringValue(str2);
6482 str_discard(str);
6483 return str_replace(str, str2);
6484}
6485
6486/*
6487 * call-seq:
6488 * clear -> self
6489 *
6490 * Removes the contents of +self+:
6491 *
6492 * s = 'foo' # => "foo"
6493 * s.clear # => ""
6494 *
6495 */
6496
6497static VALUE
6498rb_str_clear(VALUE str)
6499{
6500 str_discard(str);
6501 STR_SET_EMBED(str);
6502 STR_SET_LEN(str, 0);
6503 RSTRING_PTR(str)[0] = 0;
6504 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6506 else
6508 return str;
6509}
6510
6511/*
6512 * call-seq:
6513 * chr -> string
6514 *
6515 * Returns a string containing the first character of +self+:
6516 *
6517 * s = 'foo' # => "foo"
6518 * s.chr # => "f"
6519 *
6520 */
6521
6522static VALUE
6523rb_str_chr(VALUE str)
6524{
6525 return rb_str_substr(str, 0, 1);
6526}
6527
6528/*
6529 * call-seq:
6530 * getbyte(index) -> integer or nil
6531 *
6532 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6533 *
6534 * s = 'abcde' # => "abcde"
6535 * s.getbyte(0) # => 97
6536 * s.getbyte(-1) # => 101
6537 * s.getbyte(5) # => nil
6538 *
6539 * Related: String#setbyte.
6540 */
6541VALUE
6542rb_str_getbyte(VALUE str, VALUE index)
6543{
6544 long pos = NUM2LONG(index);
6545
6546 if (pos < 0)
6547 pos += RSTRING_LEN(str);
6548 if (pos < 0 || RSTRING_LEN(str) <= pos)
6549 return Qnil;
6550
6551 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6552}
6553
6554/*
6555 * call-seq:
6556 * setbyte(index, integer) -> integer
6557 *
6558 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6559 *
6560 * s = 'abcde' # => "abcde"
6561 * s.setbyte(0, 98) # => 98
6562 * s # => "bbcde"
6563 *
6564 * Related: String#getbyte.
6565 */
6566VALUE
6567rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6568{
6569 long pos = NUM2LONG(index);
6570 long len = RSTRING_LEN(str);
6571 char *ptr, *head, *left = 0;
6572 rb_encoding *enc;
6573 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6574
6575 if (pos < -len || len <= pos)
6576 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6577 if (pos < 0)
6578 pos += len;
6579
6580 VALUE v = rb_to_int(value);
6581 VALUE w = rb_int_and(v, INT2FIX(0xff));
6582 char byte = (char)(NUM2INT(w) & 0xFF);
6583
6584 if (!str_independent(str))
6585 str_make_independent(str);
6586 enc = STR_ENC_GET(str);
6587 head = RSTRING_PTR(str);
6588 ptr = &head[pos];
6589 if (!STR_EMBED_P(str)) {
6590 cr = ENC_CODERANGE(str);
6591 switch (cr) {
6592 case ENC_CODERANGE_7BIT:
6593 left = ptr;
6594 *ptr = byte;
6595 if (ISASCII(byte)) goto end;
6596 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6597 if (!MBCLEN_CHARFOUND_P(nlen))
6599 else
6601 goto end;
6603 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6604 width = rb_enc_precise_mbclen(left, head+len, enc);
6605 *ptr = byte;
6606 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6607 if (!MBCLEN_CHARFOUND_P(nlen))
6609 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6611 goto end;
6612 }
6613 }
6615 *ptr = byte;
6616
6617 end:
6618 return value;
6619}
6620
6621static VALUE
6622str_byte_substr(VALUE str, long beg, long len, int empty)
6623{
6624 long n = RSTRING_LEN(str);
6625
6626 if (beg > n || len < 0) return Qnil;
6627 if (beg < 0) {
6628 beg += n;
6629 if (beg < 0) return Qnil;
6630 }
6631 if (len > n - beg)
6632 len = n - beg;
6633 if (len <= 0) {
6634 if (!empty) return Qnil;
6635 len = 0;
6636 }
6637
6638 VALUE str2 = str_subseq(str, beg, len);
6639
6640 str_enc_copy_direct(str2, str);
6641
6642 if (RSTRING_LEN(str2) == 0) {
6643 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6645 else
6647 }
6648 else {
6649 switch (ENC_CODERANGE(str)) {
6650 case ENC_CODERANGE_7BIT:
6652 break;
6653 default:
6655 break;
6656 }
6657 }
6658
6659 return str2;
6660}
6661
6662VALUE
6663rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6664{
6665 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6666}
6667
6668static VALUE
6669str_byte_aref(VALUE str, VALUE indx)
6670{
6671 long idx;
6672 if (FIXNUM_P(indx)) {
6673 idx = FIX2LONG(indx);
6674 }
6675 else {
6676 /* check if indx is Range */
6677 long beg, len = RSTRING_LEN(str);
6678
6679 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6680 case Qfalse:
6681 break;
6682 case Qnil:
6683 return Qnil;
6684 default:
6685 return str_byte_substr(str, beg, len, TRUE);
6686 }
6687
6688 idx = NUM2LONG(indx);
6689 }
6690 return str_byte_substr(str, idx, 1, FALSE);
6691}
6692
6693/*
6694 * call-seq:
6695 * byteslice(index, length = 1) -> string or nil
6696 * byteslice(range) -> string or nil
6697 *
6698 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6699 *
6700 * With integer arguments +index+ and +length+ given,
6701 * returns the substring beginning at the given +index+
6702 * of the given +length+ (if possible),
6703 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6704 *
6705 * s = '0123456789' # => "0123456789"
6706 * s.byteslice(2) # => "2"
6707 * s.byteslice(200) # => nil
6708 * s.byteslice(4, 3) # => "456"
6709 * s.byteslice(4, 30) # => "456789"
6710 * s.byteslice(4, -1) # => nil
6711 * s.byteslice(40, 2) # => nil
6712 *
6713 * In either case above, counts backwards from the end of +self+
6714 * if +index+ is negative:
6715 *
6716 * s = '0123456789' # => "0123456789"
6717 * s.byteslice(-4) # => "6"
6718 * s.byteslice(-4, 3) # => "678"
6719 *
6720 * With Range argument +range+ given, returns
6721 * <tt>byteslice(range.begin, range.size)</tt>:
6722 *
6723 * s = '0123456789' # => "0123456789"
6724 * s.byteslice(4..6) # => "456"
6725 * s.byteslice(-6..-4) # => "456"
6726 * s.byteslice(5..2) # => "" # range.size is zero.
6727 * s.byteslice(40..42) # => nil
6728 *
6729 * In all cases, a returned string has the same encoding as +self+:
6730 *
6731 * s.encoding # => #<Encoding:UTF-8>
6732 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6733 *
6734 */
6735
6736static VALUE
6737rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6738{
6739 if (argc == 2) {
6740 long beg = NUM2LONG(argv[0]);
6741 long len = NUM2LONG(argv[1]);
6742 return str_byte_substr(str, beg, len, TRUE);
6743 }
6744 rb_check_arity(argc, 1, 2);
6745 return str_byte_aref(str, argv[0]);
6746}
6747
6748static void
6749str_check_beg_len(VALUE str, long *beg, long *len)
6750{
6751 long end, slen = RSTRING_LEN(str);
6752
6753 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6754 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6755 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6756 }
6757 if (*beg < 0) {
6758 *beg += slen;
6759 }
6760 RUBY_ASSERT(*beg >= 0);
6761 RUBY_ASSERT(*beg <= slen);
6762
6763 if (*len > slen - *beg) {
6764 *len = slen - *beg;
6765 }
6766 end = *beg + *len;
6767 str_ensure_byte_pos(str, *beg);
6768 str_ensure_byte_pos(str, end);
6769}
6770
/*
 *  call-seq:
 *    bytesplice(index, length, str) -> string
 *    bytesplice(index, length, str, str_index, str_length) -> string
 *    bytesplice(range, str) -> string
 *    bytesplice(range, str, str_range) -> string
 *
 *  Replaces some or all of the content of +self+ with +str+, and returns +self+.
 *  The portion of the string affected is determined using
 *  the same criteria as String#byteslice, except that +length+ cannot be omitted.
 *  If the replacement string is not the same length as the text it is replacing,
 *  the string will be adjusted accordingly.
 *
 *  If +str_index+ and +str_length+, or +str_range+, are given, the content
 *  of +self+ is replaced by <tt>str.byteslice(str_index, str_length)</tt> or
 *  <tt>str.byteslice(str_range)</tt>; however, the substring of +str+ is not
 *  allocated as a new string.
 *
 *  The forms that take an Integer will raise an IndexError if the value is out
 *  of range; the Range forms will raise a RangeError.
 *  If the beginning or ending offset does not land on a character (codepoint)
 *  boundary, an IndexError will be raised.
 */
6791
6792static VALUE
6793rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6794{
6795 long beg, len, vbeg, vlen;
6796 VALUE val;
6797 int cr;
6798
6799 rb_check_arity(argc, 2, 5);
6800 if (!(argc == 2 || argc == 3 || argc == 5)) {
6801 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6802 }
6803 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6804 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6805 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6806 rb_builtin_class_name(argv[0]));
6807 }
6808 val = argv[1];
6809 StringValue(val);
6810 if (argc == 2) {
6811 /* bytesplice(range, str) */
6812 vbeg = 0;
6813 vlen = RSTRING_LEN(val);
6814 }
6815 else {
6816 /* bytesplice(range, str, str_range) */
6817 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6818 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6819 rb_builtin_class_name(argv[2]));
6820 }
6821 }
6822 }
6823 else {
6824 beg = NUM2LONG(argv[0]);
6825 len = NUM2LONG(argv[1]);
6826 val = argv[2];
6827 StringValue(val);
6828 if (argc == 3) {
6829 /* bytesplice(index, length, str) */
6830 vbeg = 0;
6831 vlen = RSTRING_LEN(val);
6832 }
6833 else {
6834 /* bytesplice(index, length, str, str_index, str_length) */
6835 vbeg = NUM2LONG(argv[3]);
6836 vlen = NUM2LONG(argv[4]);
6837 }
6838 }
6839 str_check_beg_len(str, &beg, &len);
6840 str_check_beg_len(val, &vbeg, &vlen);
6841 str_modify_keep_cr(str);
6842
6843 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6844 rb_enc_associate(str, rb_enc_check(str, val));
6845 }
6846
6847 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6849 if (cr != ENC_CODERANGE_BROKEN)
6850 ENC_CODERANGE_SET(str, cr);
6851 return str;
6852}
6853
6854/*
6855 * call-seq:
6856 * reverse -> string
6857 *
6858 * Returns a new string with the characters from +self+ in reverse order.
6859 *
6860 * 'stressed'.reverse # => "desserts"
6861 *
6862 */
6863
6864static VALUE
6865rb_str_reverse(VALUE str)
6866{
6867 rb_encoding *enc;
6868 VALUE rev;
6869 char *s, *e, *p;
6870 int cr;
6871
6872 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6873 enc = STR_ENC_GET(str);
6874 rev = rb_str_new(0, RSTRING_LEN(str));
6875 s = RSTRING_PTR(str); e = RSTRING_END(str);
6876 p = RSTRING_END(rev);
6877 cr = ENC_CODERANGE(str);
6878
6879 if (RSTRING_LEN(str) > 1) {
6880 if (single_byte_optimizable(str)) {
6881 while (s < e) {
6882 *--p = *s++;
6883 }
6884 }
6885 else if (cr == ENC_CODERANGE_VALID) {
6886 while (s < e) {
6887 int clen = rb_enc_fast_mbclen(s, e, enc);
6888
6889 p -= clen;
6890 memcpy(p, s, clen);
6891 s += clen;
6892 }
6893 }
6894 else {
6895 cr = rb_enc_asciicompat(enc) ?
6897 while (s < e) {
6898 int clen = rb_enc_mbclen(s, e, enc);
6899
6900 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6901 p -= clen;
6902 memcpy(p, s, clen);
6903 s += clen;
6904 }
6905 }
6906 }
6907 STR_SET_LEN(rev, RSTRING_LEN(str));
6908 str_enc_copy_direct(rev, str);
6909 ENC_CODERANGE_SET(rev, cr);
6910
6911 return rev;
6912}
6913
6914
6915/*
6916 * call-seq:
6917 * reverse! -> self
6918 *
6919 * Returns +self+ with its characters reversed:
6920 *
6921 * s = 'stressed'
6922 * s.reverse! # => "desserts"
6923 * s # => "desserts"
6924 *
6925 */
6926
6927static VALUE
6928rb_str_reverse_bang(VALUE str)
6929{
6930 if (RSTRING_LEN(str) > 1) {
6931 if (single_byte_optimizable(str)) {
6932 char *s, *e, c;
6933
6934 str_modify_keep_cr(str);
6935 s = RSTRING_PTR(str);
6936 e = RSTRING_END(str) - 1;
6937 while (s < e) {
6938 c = *s;
6939 *s++ = *e;
6940 *e-- = c;
6941 }
6942 }
6943 else {
6944 str_shared_replace(str, rb_str_reverse(str));
6945 }
6946 }
6947 else {
6948 str_modify_keep_cr(str);
6949 }
6950 return str;
6951}
6952
6953
6954/*
6955 * call-seq:
6956 * include?(other_string) -> true or false
6957 *
6958 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6959 *
6960 * s = 'foo'
6961 * s.include?('f') # => true
6962 * s.include?('fo') # => true
6963 * s.include?('food') # => false
6964 *
6965 */
6966
6967VALUE
6968rb_str_include(VALUE str, VALUE arg)
6969{
6970 long i;
6971
6972 StringValue(arg);
6973 i = rb_str_index(str, arg, 0);
6974
6975 return RBOOL(i != -1);
6976}
6977
6978
/*
 *  call-seq:
 *    to_i(base = 10) -> integer
 *
 *  Returns the result of interpreting leading characters in +self+
 *  as an integer in the given +base+ (which must be in (0, 2..36)):
 *
 *    '123456'.to_i     # => 123456
 *    '123def'.to_i(16) # => 1195503
 *
 *  With +base+ zero, +self+ may contain a leading prefix
 *  that specifies the actual base:
 *
 *    '123def'.to_i(0)   # => 123
 *    '0123def'.to_i(0)  # => 83
 *    '0b123def'.to_i(0) # => 1
 *    '0o123def'.to_i(0) # => 83
 *    '0d123def'.to_i(0) # => 123
 *    '0x123def'.to_i(0) # => 1195503
 *
 *  Characters past a leading valid number (in the given +base+) are ignored:
 *
 *    '12.345'.to_i   # => 12
 *    '12345'.to_i(2) # => 1
 *
 *  Returns zero if there is no leading valid number:
 *
 *    'abcdef'.to_i # => 0
 *    '2'.to_i(2)   # => 0
 *
 */
7010
7011static VALUE
7012rb_str_to_i(int argc, VALUE *argv, VALUE str)
7013{
7014 int base = 10;
7015
7016 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7017 rb_raise(rb_eArgError, "invalid radix %d", base);
7018 }
7019 return rb_str_to_inum(str, base, FALSE);
7020}
7021
7022
/*
 *  call-seq:
 *    to_f -> float
 *
 *  Returns the result of interpreting leading characters in +self+ as a Float:
 *
 *    '3.14159'.to_f  # => 3.14159
 *    '1.234e-2'.to_f # => 0.01234
 *
 *  Characters past a leading valid number are ignored:
 *
 *    '3.14 (pi to two places)'.to_f # => 3.14
 *
 *  Returns zero if there is no leading valid number:
 *
 *    'abcdef'.to_f # => 0.0
 *
 */
7041
7042static VALUE
7043rb_str_to_f(VALUE str)
7044{
7045 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7046}
7047
7048
7049/*
7050 * call-seq:
7051 * to_s -> self or string
7052 *
7053 * Returns +self+ if +self+ is a +String+,
7054 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7055 */
7056
7057static VALUE
7058rb_str_to_s(VALUE str)
7059{
7060 if (rb_obj_class(str) != rb_cString) {
7061 return str_duplicate(rb_cString, str);
7062 }
7063 return str;
7064}
7065
#if 0
/* Disabled helper: encode codepoint c in enc and append it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7077
7078#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7079
7080int
7081rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7082{
7083 char buf[CHAR_ESC_LEN + 1];
7084 int l;
7085
7086#if SIZEOF_INT > 4
7087 c &= 0xffffffff;
7088#endif
7089 if (unicode_p) {
7090 if (c < 0x7F && ISPRINT(c)) {
7091 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7092 }
7093 else if (c < 0x10000) {
7094 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7095 }
7096 else {
7097 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7098 }
7099 }
7100 else {
7101 if (c < 0x100) {
7102 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7103 }
7104 else {
7105 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7106 }
7107 }
7108 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7109 rb_str_buf_cat(result, buf, l);
7110 return l;
7111}
7112
/*
 * Return the backslash escape sequence used for control character c,
 * or NULL when c has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } table[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    const int count = (int)(sizeof(table) / sizeof(table[0]));
    int i;

    for (i = 0; i < count; i++) {
        if (table[i].code == c) return table[i].escape;
    }
    return NULL;
}
7130
7131VALUE
7132rb_str_escape(VALUE str)
7133{
7134 int encidx = ENCODING_GET(str);
7135 rb_encoding *enc = rb_enc_from_index(encidx);
7136 const char *p = RSTRING_PTR(str);
7137 const char *pend = RSTRING_END(str);
7138 const char *prev = p;
7139 char buf[CHAR_ESC_LEN + 1];
7140 VALUE result = rb_str_buf_new(0);
7141 int unicode_p = rb_enc_unicode_p(enc);
7142 int asciicompat = rb_enc_asciicompat(enc);
7143
7144 while (p < pend) {
7145 unsigned int c;
7146 const char *cc;
7147 int n = rb_enc_precise_mbclen(p, pend, enc);
7148 if (!MBCLEN_CHARFOUND_P(n)) {
7149 if (p > prev) str_buf_cat(result, prev, p - prev);
7150 n = rb_enc_mbminlen(enc);
7151 if (pend < p + n)
7152 n = (int)(pend - p);
7153 while (n--) {
7154 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7155 str_buf_cat(result, buf, strlen(buf));
7156 prev = ++p;
7157 }
7158 continue;
7159 }
7160 n = MBCLEN_CHARFOUND_LEN(n);
7161 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7162 p += n;
7163 cc = ruby_escaped_char(c);
7164 if (cc) {
7165 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7166 str_buf_cat(result, cc, strlen(cc));
7167 prev = p;
7168 }
7169 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7170 }
7171 else {
7172 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7173 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7174 prev = p;
7175 }
7176 }
7177 if (p > prev) str_buf_cat(result, prev, p - prev);
7178 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7179
7180 return result;
7181}
7182
7183/*
7184 * call-seq:
7185 * inspect -> string
7186 *
7187 * Returns a printable version of +self+, enclosed in double-quotes,
7188 * and with special characters escaped:
7189 *
7190 * s = "foo\tbar\tbaz\n"
7191 * s.inspect
7192 * # => "\"foo\\tbar\\tbaz\\n\""
7193 *
7194 */
7195
7196VALUE
7198{
7199 int encidx = ENCODING_GET(str);
7200 rb_encoding *enc = rb_enc_from_index(encidx);
7201 const char *p, *pend, *prev;
7202 char buf[CHAR_ESC_LEN + 1];
7203 VALUE result = rb_str_buf_new(0);
7204 rb_encoding *resenc = rb_default_internal_encoding();
7205 int unicode_p = rb_enc_unicode_p(enc);
7206 int asciicompat = rb_enc_asciicompat(enc);
7207
7208 if (resenc == NULL) resenc = rb_default_external_encoding();
7209 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7210 rb_enc_associate(result, resenc);
7211 str_buf_cat2(result, "\"");
7212
7213 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7214 prev = p;
7215 while (p < pend) {
7216 unsigned int c, cc;
7217 int n;
7218
7219 n = rb_enc_precise_mbclen(p, pend, enc);
7220 if (!MBCLEN_CHARFOUND_P(n)) {
7221 if (p > prev) str_buf_cat(result, prev, p - prev);
7222 n = rb_enc_mbminlen(enc);
7223 if (pend < p + n)
7224 n = (int)(pend - p);
7225 while (n--) {
7226 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7227 str_buf_cat(result, buf, strlen(buf));
7228 prev = ++p;
7229 }
7230 continue;
7231 }
7232 n = MBCLEN_CHARFOUND_LEN(n);
7233 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7234 p += n;
7235 if ((asciicompat || unicode_p) &&
7236 (c == '"'|| c == '\\' ||
7237 (c == '#' &&
7238 p < pend &&
7239 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7240 (cc = rb_enc_codepoint(p,pend,enc),
7241 (cc == '$' || cc == '@' || cc == '{'))))) {
7242 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7243 str_buf_cat2(result, "\\");
7244 if (asciicompat || enc == resenc) {
7245 prev = p - n;
7246 continue;
7247 }
7248 }
7249 switch (c) {
7250 case '\n': cc = 'n'; break;
7251 case '\r': cc = 'r'; break;
7252 case '\t': cc = 't'; break;
7253 case '\f': cc = 'f'; break;
7254 case '\013': cc = 'v'; break;
7255 case '\010': cc = 'b'; break;
7256 case '\007': cc = 'a'; break;
7257 case 033: cc = 'e'; break;
7258 default: cc = 0; break;
7259 }
7260 if (cc) {
7261 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7262 buf[0] = '\\';
7263 buf[1] = (char)cc;
7264 str_buf_cat(result, buf, 2);
7265 prev = p;
7266 continue;
7267 }
7268 /* The special casing of 0x85 (NEXT_LINE) here is because
7269 * Oniguruma historically treats it as printable, but it
7270 * doesn't match the print POSIX bracket class or character
7271 * property in regexps.
7272 *
7273 * See Ruby Bug #16842 for details:
7274 * https://bugs.ruby-lang.org/issues/16842
7275 */
7276 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7277 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7278 continue;
7279 }
7280 else {
7281 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7282 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7283 prev = p;
7284 continue;
7285 }
7286 }
7287 if (p > prev) str_buf_cat(result, prev, p - prev);
7288 str_buf_cat2(result, "\"");
7289
7290 return result;
7291}
7292
/* True when p lies before e and points at '$', '@' or '{'.
 * Note that p is evaluated more than once. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7294
7295/*
7296 * call-seq:
7297 * dump -> string
7298 *
7299 * Returns a printable version of +self+, enclosed in double-quotes,
7300 * with special characters escaped, and with non-printing characters
7301 * replaced by hexadecimal notation:
7302 *
7303 * "hello \n ''".dump # => "\"hello \\n ''\""
7304 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7305 *
7306 * Related: String#undump (inverse of String#dump).
7307 *
7308 */
7309
7310VALUE
7312{
7313 int encidx = rb_enc_get_index(str);
7314 rb_encoding *enc = rb_enc_from_index(encidx);
7315 long len;
7316 const char *p, *pend;
7317 char *q, *qend;
7318 VALUE result;
7319 int u8 = (encidx == rb_utf8_encindex());
7320 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7321
7322 len = 2; /* "" */
7323 if (!rb_enc_asciicompat(enc)) {
7324 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7325 len += strlen(enc->name);
7326 }
7327
7328 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7329 while (p < pend) {
7330 int clen;
7331 unsigned char c = *p++;
7332
7333 switch (c) {
7334 case '"': case '\\':
7335 case '\n': case '\r':
7336 case '\t': case '\f':
7337 case '\013': case '\010': case '\007': case '\033':
7338 clen = 2;
7339 break;
7340
7341 case '#':
7342 clen = IS_EVSTR(p, pend) ? 2 : 1;
7343 break;
7344
7345 default:
7346 if (ISPRINT(c)) {
7347 clen = 1;
7348 }
7349 else {
7350 if (u8 && c > 0x7F) { /* \u notation */
7351 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7352 if (MBCLEN_CHARFOUND_P(n)) {
7353 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7354 if (cc <= 0xFFFF)
7355 clen = 6; /* \uXXXX */
7356 else if (cc <= 0xFFFFF)
7357 clen = 9; /* \u{XXXXX} */
7358 else
7359 clen = 10; /* \u{XXXXXX} */
7360 p += MBCLEN_CHARFOUND_LEN(n)-1;
7361 break;
7362 }
7363 }
7364 clen = 4; /* \xNN */
7365 }
7366 break;
7367 }
7368
7369 if (clen > LONG_MAX - len) {
7370 rb_raise(rb_eRuntimeError, "string size too big");
7371 }
7372 len += clen;
7373 }
7374
7375 result = rb_str_new(0, len);
7376 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7377 q = RSTRING_PTR(result); qend = q + len + 1;
7378
7379 *q++ = '"';
7380 while (p < pend) {
7381 unsigned char c = *p++;
7382
7383 if (c == '"' || c == '\\') {
7384 *q++ = '\\';
7385 *q++ = c;
7386 }
7387 else if (c == '#') {
7388 if (IS_EVSTR(p, pend)) *q++ = '\\';
7389 *q++ = '#';
7390 }
7391 else if (c == '\n') {
7392 *q++ = '\\';
7393 *q++ = 'n';
7394 }
7395 else if (c == '\r') {
7396 *q++ = '\\';
7397 *q++ = 'r';
7398 }
7399 else if (c == '\t') {
7400 *q++ = '\\';
7401 *q++ = 't';
7402 }
7403 else if (c == '\f') {
7404 *q++ = '\\';
7405 *q++ = 'f';
7406 }
7407 else if (c == '\013') {
7408 *q++ = '\\';
7409 *q++ = 'v';
7410 }
7411 else if (c == '\010') {
7412 *q++ = '\\';
7413 *q++ = 'b';
7414 }
7415 else if (c == '\007') {
7416 *q++ = '\\';
7417 *q++ = 'a';
7418 }
7419 else if (c == '\033') {
7420 *q++ = '\\';
7421 *q++ = 'e';
7422 }
7423 else if (ISPRINT(c)) {
7424 *q++ = c;
7425 }
7426 else {
7427 *q++ = '\\';
7428 if (u8) {
7429 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7430 if (MBCLEN_CHARFOUND_P(n)) {
7431 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7432 p += n;
7433 if (cc <= 0xFFFF)
7434 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7435 else
7436 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7437 q += strlen(q);
7438 continue;
7439 }
7440 }
7441 snprintf(q, qend-q, "x%02X", c);
7442 q += 3;
7443 }
7444 }
7445 *q++ = '"';
7446 *q = '\0';
7447 if (!rb_enc_asciicompat(enc)) {
7448 snprintf(q, qend-q, nonascii_suffix, enc->name);
7449 encidx = rb_ascii8bit_encindex();
7450 }
7451 /* result from dump is ASCII */
7452 rb_enc_associate_index(result, encidx);
7454 return result;
7455}
7456
/*
 * Maps the letter of a single-character backslash escape (n, r, t, f, v, b,
 * a, e) to the control byte it stands for.
 *
 * Returns the byte value, or -1 when +c+ is not one of those letters.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* never fall off the end of a non-void function */
        return -1;
    }
}
7480
7481static void
7482undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7483{
7484 const char *s = *ss;
7485 unsigned int c;
7486 int codelen;
7487 size_t hexlen;
7488 unsigned char buf[6];
7489 static rb_encoding *enc_utf8 = NULL;
7490
7491 switch (*s) {
7492 case '\\':
7493 case '"':
7494 case '#':
7495 rb_str_cat(undumped, s, 1); /* cat itself */
7496 s++;
7497 break;
7498 case 'n':
7499 case 'r':
7500 case 't':
7501 case 'f':
7502 case 'v':
7503 case 'b':
7504 case 'a':
7505 case 'e':
7506 *buf = unescape_ascii(*s);
7507 rb_str_cat(undumped, (char *)buf, 1);
7508 s++;
7509 break;
7510 case 'u':
7511 if (*binary) {
7512 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7513 }
7514 *utf8 = true;
7515 if (++s >= s_end) {
7516 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7517 }
7518 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7519 if (*penc != enc_utf8) {
7520 *penc = enc_utf8;
7521 rb_enc_associate(undumped, enc_utf8);
7522 }
7523 if (*s == '{') { /* handle \u{...} form */
7524 s++;
7525 for (;;) {
7526 if (s >= s_end) {
7527 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7528 }
7529 if (*s == '}') {
7530 s++;
7531 break;
7532 }
7533 if (ISSPACE(*s)) {
7534 s++;
7535 continue;
7536 }
7537 c = scan_hex(s, s_end-s, &hexlen);
7538 if (hexlen == 0 || hexlen > 6) {
7539 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7540 }
7541 if (c > 0x10ffff) {
7542 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7543 }
7544 if (0xd800 <= c && c <= 0xdfff) {
7545 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7546 }
7547 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7548 rb_str_cat(undumped, (char *)buf, codelen);
7549 s += hexlen;
7550 }
7551 }
7552 else { /* handle \uXXXX form */
7553 c = scan_hex(s, 4, &hexlen);
7554 if (hexlen != 4) {
7555 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7556 }
7557 if (0xd800 <= c && c <= 0xdfff) {
7558 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7559 }
7560 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7561 rb_str_cat(undumped, (char *)buf, codelen);
7562 s += hexlen;
7563 }
7564 break;
7565 case 'x':
7566 if (*utf8) {
7567 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7568 }
7569 *binary = true;
7570 if (++s >= s_end) {
7571 rb_raise(rb_eRuntimeError, "invalid hex escape");
7572 }
7573 *buf = scan_hex(s, 2, &hexlen);
7574 if (hexlen != 2) {
7575 rb_raise(rb_eRuntimeError, "invalid hex escape");
7576 }
7577 rb_str_cat(undumped, (char *)buf, 1);
7578 s += hexlen;
7579 break;
7580 default:
7581 rb_str_cat(undumped, s-1, 2);
7582 s++;
7583 }
7584
7585 *ss = s;
7586}
7587
7588static VALUE rb_str_is_ascii_only_p(VALUE str);
7589
7590/*
7591 * call-seq:
7592 * undump -> string
7593 *
7594 * Returns an unescaped version of +self+:
7595 *
7596 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7597 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7598 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7599 * s_undumped == s_orig # => true
7600 *
7601 * Related: String#dump (inverse of String#undump).
7602 *
7603 */
7604
7605static VALUE
7606str_undump(VALUE str)
7607{
7608 const char *s = RSTRING_PTR(str);
7609 const char *s_end = RSTRING_END(str);
7610 rb_encoding *enc = rb_enc_get(str);
7611 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7612 bool utf8 = false;
7613 bool binary = false;
7614 int w;
7615
7617 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7618 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7619 }
7620 if (!str_null_check(str, &w)) {
7621 rb_raise(rb_eRuntimeError, "string contains null byte");
7622 }
7623 if (RSTRING_LEN(str) < 2) goto invalid_format;
7624 if (*s != '"') goto invalid_format;
7625
7626 /* strip '"' at the start */
7627 s++;
7628
7629 for (;;) {
7630 if (s >= s_end) {
7631 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7632 }
7633
7634 if (*s == '"') {
7635 /* epilogue */
7636 s++;
7637 if (s == s_end) {
7638 /* ascii compatible dumped string */
7639 break;
7640 }
7641 else {
7642 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7643 static const char dup_suffix[] = ".dup";
7644 const char *encname;
7645 int encidx;
7646 ptrdiff_t size;
7647
7648 /* check separately for strings dumped by older versions */
7649 size = sizeof(dup_suffix) - 1;
7650 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7651
7652 size = sizeof(force_encoding_suffix) - 1;
7653 if (s_end - s <= size) goto invalid_format;
7654 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7655 s += size;
7656
7657 if (utf8) {
7658 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7659 }
7660
7661 encname = s;
7662 s = memchr(s, '"', s_end-s);
7663 size = s - encname;
7664 if (!s) goto invalid_format;
7665 if (s_end - s != 2) goto invalid_format;
7666 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7667
7668 encidx = rb_enc_find_index2(encname, (long)size);
7669 if (encidx < 0) {
7670 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7671 }
7672 rb_enc_associate_index(undumped, encidx);
7673 }
7674 break;
7675 }
7676
7677 if (*s == '\\') {
7678 s++;
7679 if (s >= s_end) {
7680 rb_raise(rb_eRuntimeError, "invalid escape");
7681 }
7682 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7683 }
7684 else {
7685 rb_str_cat(undumped, s++, 1);
7686 }
7687 }
7688
7689 RB_GC_GUARD(str);
7690
7691 return undumped;
7692invalid_format:
7693 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7694}
7695
7696static void
7697rb_str_check_dummy_enc(rb_encoding *enc)
7698{
7699 if (rb_enc_dummy_p(enc)) {
7700 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7701 rb_enc_name(enc));
7702 }
7703}
7704
7705static rb_encoding *
7706str_true_enc(VALUE str)
7707{
7708 rb_encoding *enc = STR_ENC_GET(str);
7709 rb_str_check_dummy_enc(enc);
7710 return enc;
7711}
7712
7713static OnigCaseFoldType
7714check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7715{
7716 if (argc==0)
7717 return flags;
7718 if (argc>2)
7719 rb_raise(rb_eArgError, "too many options");
7720 if (argv[0]==sym_turkic) {
7721 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7722 if (argc==2) {
7723 if (argv[1]==sym_lithuanian)
7724 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7725 else
7726 rb_raise(rb_eArgError, "invalid second option");
7727 }
7728 }
7729 else if (argv[0]==sym_lithuanian) {
7730 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7731 if (argc==2) {
7732 if (argv[1]==sym_turkic)
7733 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7734 else
7735 rb_raise(rb_eArgError, "invalid second option");
7736 }
7737 }
7738 else if (argc>1)
7739 rb_raise(rb_eArgError, "too many options");
7740 else if (argv[0]==sym_ascii)
7741 flags |= ONIGENC_CASE_ASCII_ONLY;
7742 else if (argv[0]==sym_fold) {
7743 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7744 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7745 else
7746 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7747 }
7748 else
7749 rb_raise(rb_eArgError, "invalid option");
7750 return flags;
7751}
7752
7753static inline bool
7754case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7755{
7756 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7757 return true;
7758 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7759}
7760
7761/* 16 should be long enough to absorb any kind of single character length increase */
7762#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7763#ifndef CASEMAP_DEBUG
7764# define CASEMAP_DEBUG 0
7765#endif
7766
7767struct mapping_buffer;
7768typedef struct mapping_buffer {
7769 size_t capa;
7770 size_t used;
7771 struct mapping_buffer *next;
7772 OnigUChar space[FLEX_ARY_LEN];
7774
7775static void
7776mapping_buffer_free(void *p)
7777{
7778 mapping_buffer *previous_buffer;
7779 mapping_buffer *current_buffer = p;
7780 while (current_buffer) {
7781 previous_buffer = current_buffer;
7782 current_buffer = current_buffer->next;
7783 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7784 }
7785}
7786
7787static const rb_data_type_t mapping_buffer_type = {
7788 "mapping_buffer",
7789 {0, mapping_buffer_free,},
7790 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7791};
7792
7793static VALUE
7794rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7795{
7796 VALUE target;
7797
7798 const OnigUChar *source_current, *source_end;
7799 int target_length = 0;
7800 VALUE buffer_anchor;
7801 mapping_buffer *current_buffer = 0;
7802 mapping_buffer **pre_buffer;
7803 size_t buffer_count = 0;
7804 int buffer_length_or_invalid;
7805
7806 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7807
7808 source_current = (OnigUChar*)RSTRING_PTR(source);
7809 source_end = (OnigUChar*)RSTRING_END(source);
7810
7811 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7812 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7813 while (source_current < source_end) {
7814 /* increase multiplier using buffer count to converge quickly */
7815 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7816 if (CASEMAP_DEBUG) {
7817 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7818 }
7819 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7820 *pre_buffer = current_buffer;
7821 pre_buffer = &current_buffer->next;
7822 current_buffer->next = NULL;
7823 current_buffer->capa = capa;
7824 buffer_length_or_invalid = enc->case_map(flags,
7825 &source_current, source_end,
7826 current_buffer->space,
7827 current_buffer->space+current_buffer->capa,
7828 enc);
7829 if (buffer_length_or_invalid < 0) {
7830 current_buffer = DATA_PTR(buffer_anchor);
7831 DATA_PTR(buffer_anchor) = 0;
7832 mapping_buffer_free(current_buffer);
7833 rb_raise(rb_eArgError, "input string invalid");
7834 }
7835 target_length += current_buffer->used = buffer_length_or_invalid;
7836 }
7837 if (CASEMAP_DEBUG) {
7838 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7839 }
7840
7841 if (buffer_count==1) {
7842 target = rb_str_new((const char*)current_buffer->space, target_length);
7843 }
7844 else {
7845 char *target_current;
7846
7847 target = rb_str_new(0, target_length);
7848 target_current = RSTRING_PTR(target);
7849 current_buffer = DATA_PTR(buffer_anchor);
7850 while (current_buffer) {
7851 memcpy(target_current, current_buffer->space, current_buffer->used);
7852 target_current += current_buffer->used;
7853 current_buffer = current_buffer->next;
7854 }
7855 }
7856 current_buffer = DATA_PTR(buffer_anchor);
7857 DATA_PTR(buffer_anchor) = 0;
7858 mapping_buffer_free(current_buffer);
7859
7860 RB_GC_GUARD(buffer_anchor);
7861
7862 /* TODO: check about string terminator character */
7863 str_enc_copy_direct(target, source);
7864 /*ENC_CODERANGE_SET(mapped, cr);*/
7865
7866 return target;
7867}
7868
7869static VALUE
7870rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7871{
7872 const OnigUChar *source_current, *source_end;
7873 OnigUChar *target_current, *target_end;
7874 long old_length = RSTRING_LEN(source);
7875 int length_or_invalid;
7876
7877 if (old_length == 0) return Qnil;
7878
7879 source_current = (OnigUChar*)RSTRING_PTR(source);
7880 source_end = (OnigUChar*)RSTRING_END(source);
7881 if (source == target) {
7882 target_current = (OnigUChar*)source_current;
7883 target_end = (OnigUChar*)source_end;
7884 }
7885 else {
7886 target_current = (OnigUChar*)RSTRING_PTR(target);
7887 target_end = (OnigUChar*)RSTRING_END(target);
7888 }
7889
7890 length_or_invalid = onigenc_ascii_only_case_map(flags,
7891 &source_current, source_end,
7892 target_current, target_end, enc);
7893 if (length_or_invalid < 0)
7894 rb_raise(rb_eArgError, "input string invalid");
7895 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7896 fprintf(stderr, "problem with rb_str_ascii_casemap"
7897 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7898 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7899 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7900 }
7901
7902 str_enc_copy(target, source);
7903
7904 return target;
7905}
7906
7907static bool
7908upcase_single(VALUE str)
7909{
7910 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7911 bool modified = false;
7912
7913 while (s < send) {
7914 unsigned int c = *(unsigned char*)s;
7915
7916 if ('a' <= c && c <= 'z') {
7917 *s = 'A' + (c - 'a');
7918 modified = true;
7919 }
7920 s++;
7921 }
7922 return modified;
7923}
7924
7925/*
7926 * call-seq:
7927 * upcase!(*options) -> self or nil
7928 *
7929 * Upcases the characters in +self+;
7930 * returns +self+ if any changes were made, +nil+ otherwise:
7931 *
7932 * s = 'Hello World!' # => "Hello World!"
7933 * s.upcase! # => "HELLO WORLD!"
7934 * s # => "HELLO WORLD!"
7935 * s.upcase! # => nil
7936 *
7937 * The casing may be affected by the given +options+;
7938 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7939 *
7940 * Related: String#upcase, String#downcase, String#downcase!.
7941 *
7942 */
7943
7944static VALUE
7945rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7946{
7947 rb_encoding *enc;
7948 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7949
7950 flags = check_case_options(argc, argv, flags);
7951 str_modify_keep_cr(str);
7952 enc = str_true_enc(str);
7953 if (case_option_single_p(flags, enc, str)) {
7954 if (upcase_single(str))
7955 flags |= ONIGENC_CASE_MODIFIED;
7956 }
7957 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7958 rb_str_ascii_casemap(str, str, &flags, enc);
7959 else
7960 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7961
7962 if (ONIGENC_CASE_MODIFIED&flags) return str;
7963 return Qnil;
7964}
7965
7966
7967/*
7968 * call-seq:
7969 * upcase(*options) -> string
7970 *
7971 * Returns a string containing the upcased characters in +self+:
7972 *
7973 * s = 'Hello World!' # => "Hello World!"
7974 * s.upcase # => "HELLO WORLD!"
7975 *
7976 * The casing may be affected by the given +options+;
7977 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7978 *
7979 * Related: String#upcase!, String#downcase, String#downcase!.
7980 *
7981 */
7982
7983static VALUE
7984rb_str_upcase(int argc, VALUE *argv, VALUE str)
7985{
7986 rb_encoding *enc;
7987 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7988 VALUE ret;
7989
7990 flags = check_case_options(argc, argv, flags);
7991 enc = str_true_enc(str);
7992 if (case_option_single_p(flags, enc, str)) {
7993 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7994 str_enc_copy_direct(ret, str);
7995 upcase_single(ret);
7996 }
7997 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7998 ret = rb_str_new(0, RSTRING_LEN(str));
7999 rb_str_ascii_casemap(str, ret, &flags, enc);
8000 }
8001 else {
8002 ret = rb_str_casemap(str, &flags, enc);
8003 }
8004
8005 return ret;
8006}
8007
8008static bool
8009downcase_single(VALUE str)
8010{
8011 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8012 bool modified = false;
8013
8014 while (s < send) {
8015 unsigned int c = *(unsigned char*)s;
8016
8017 if ('A' <= c && c <= 'Z') {
8018 *s = 'a' + (c - 'A');
8019 modified = true;
8020 }
8021 s++;
8022 }
8023
8024 return modified;
8025}
8026
8027/*
8028 * call-seq:
8029 * downcase!(*options) -> self or nil
8030 *
8031 * Downcases the characters in +self+;
8032 * returns +self+ if any changes were made, +nil+ otherwise:
8033 *
8034 * s = 'Hello World!' # => "Hello World!"
8035 * s.downcase! # => "hello world!"
8036 * s # => "hello world!"
8037 * s.downcase! # => nil
8038 *
8039 * The casing may be affected by the given +options+;
8040 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8041 *
8042 * Related: String#downcase, String#upcase, String#upcase!.
8043 *
8044 */
8045
8046static VALUE
8047rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8048{
8049 rb_encoding *enc;
8050 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8051
8052 flags = check_case_options(argc, argv, flags);
8053 str_modify_keep_cr(str);
8054 enc = str_true_enc(str);
8055 if (case_option_single_p(flags, enc, str)) {
8056 if (downcase_single(str))
8057 flags |= ONIGENC_CASE_MODIFIED;
8058 }
8059 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8060 rb_str_ascii_casemap(str, str, &flags, enc);
8061 else
8062 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8063
8064 if (ONIGENC_CASE_MODIFIED&flags) return str;
8065 return Qnil;
8066}
8067
8068
8069/*
8070 * call-seq:
8071 * downcase(*options) -> string
8072 *
8073 * Returns a string containing the downcased characters in +self+:
8074 *
8075 * s = 'Hello World!' # => "Hello World!"
8076 * s.downcase # => "hello world!"
8077 *
8078 * The casing may be affected by the given +options+;
8079 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8080 *
8081 * Related: String#downcase!, String#upcase, String#upcase!.
8082 *
8083 */
8084
8085static VALUE
8086rb_str_downcase(int argc, VALUE *argv, VALUE str)
8087{
8088 rb_encoding *enc;
8089 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8090 VALUE ret;
8091
8092 flags = check_case_options(argc, argv, flags);
8093 enc = str_true_enc(str);
8094 if (case_option_single_p(flags, enc, str)) {
8095 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8096 str_enc_copy_direct(ret, str);
8097 downcase_single(ret);
8098 }
8099 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8100 ret = rb_str_new(0, RSTRING_LEN(str));
8101 rb_str_ascii_casemap(str, ret, &flags, enc);
8102 }
8103 else {
8104 ret = rb_str_casemap(str, &flags, enc);
8105 }
8106
8107 return ret;
8108}
8109
8110
8111/*
8112 * call-seq:
8113 * capitalize!(*options) -> self or nil
8114 *
8115 * Upcases the first character in +self+;
8116 * downcases the remaining characters;
8117 * returns +self+ if any changes were made, +nil+ otherwise:
8118 *
8119 * s = 'hello World!' # => "hello World!"
8120 * s.capitalize! # => "Hello world!"
8121 * s # => "Hello world!"
8122 * s.capitalize! # => nil
8123 *
8124 * The casing may be affected by the given +options+;
8125 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8126 *
8127 * Related: String#capitalize.
8128 *
8129 */
8130
8131static VALUE
8132rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8133{
8134 rb_encoding *enc;
8135 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8136
8137 flags = check_case_options(argc, argv, flags);
8138 str_modify_keep_cr(str);
8139 enc = str_true_enc(str);
8140 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8141 if (flags&ONIGENC_CASE_ASCII_ONLY)
8142 rb_str_ascii_casemap(str, str, &flags, enc);
8143 else
8144 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8145
8146 if (ONIGENC_CASE_MODIFIED&flags) return str;
8147 return Qnil;
8148}
8149
8150
8151/*
8152 * call-seq:
8153 * capitalize(*options) -> string
8154 *
8155 * Returns a string containing the characters in +self+;
8156 * the first character is upcased;
8157 * the remaining characters are downcased:
8158 *
8159 * s = 'hello World!' # => "hello World!"
8160 * s.capitalize # => "Hello world!"
8161 *
8162 * The casing may be affected by the given +options+;
8163 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8164 *
8165 * Related: String#capitalize!.
8166 *
8167 */
8168
8169static VALUE
8170rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8171{
8172 rb_encoding *enc;
8173 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8174 VALUE ret;
8175
8176 flags = check_case_options(argc, argv, flags);
8177 enc = str_true_enc(str);
8178 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8179 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8180 ret = rb_str_new(0, RSTRING_LEN(str));
8181 rb_str_ascii_casemap(str, ret, &flags, enc);
8182 }
8183 else {
8184 ret = rb_str_casemap(str, &flags, enc);
8185 }
8186 return ret;
8187}
8188
8189
8190/*
8191 * call-seq:
8192 * swapcase!(*options) -> self or nil
8193 *
8194 * Upcases each lowercase character in +self+;
8195 * downcases uppercase character;
8196 * returns +self+ if any changes were made, +nil+ otherwise:
8197 *
8198 * s = 'Hello World!' # => "Hello World!"
8199 * s.swapcase! # => "hELLO wORLD!"
8200 * s # => "hELLO wORLD!"
8201 * ''.swapcase! # => nil
8202 *
8203 * The casing may be affected by the given +options+;
8204 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8205 *
8206 * Related: String#swapcase.
8207 *
8208 */
8209
8210static VALUE
8211rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8212{
8213 rb_encoding *enc;
8214 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8215
8216 flags = check_case_options(argc, argv, flags);
8217 str_modify_keep_cr(str);
8218 enc = str_true_enc(str);
8219 if (flags&ONIGENC_CASE_ASCII_ONLY)
8220 rb_str_ascii_casemap(str, str, &flags, enc);
8221 else
8222 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8223
8224 if (ONIGENC_CASE_MODIFIED&flags) return str;
8225 return Qnil;
8226}
8227
8228
8229/*
8230 * call-seq:
8231 * swapcase(*options) -> string
8232 *
8233 * Returns a string containing the characters in +self+, with cases reversed;
8234 * each uppercase character is downcased;
8235 * each lowercase character is upcased:
8236 *
8237 * s = 'Hello World!' # => "Hello World!"
8238 * s.swapcase # => "hELLO wORLD!"
8239 *
8240 * The casing may be affected by the given +options+;
8241 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8242 *
8243 * Related: String#swapcase!.
8244 *
8245 */
8246
8247static VALUE
8248rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8249{
8250 rb_encoding *enc;
8251 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8252 VALUE ret;
8253
8254 flags = check_case_options(argc, argv, flags);
8255 enc = str_true_enc(str);
8256 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8257 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8258 ret = rb_str_new(0, RSTRING_LEN(str));
8259 rb_str_ascii_casemap(str, ret, &flags, enc);
8260 }
8261 else {
8262 ret = rb_str_casemap(str, &flags, enc);
8263 }
8264 return ret;
8265}
8266
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/* Cursor over a transliteration pattern, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the pattern */
    char *pend;         /* end of the pattern */
};
8274
8275static unsigned int
8276trnext(struct tr *t, rb_encoding *enc)
8277{
8278 int n;
8279
8280 for (;;) {
8281 nextpart:
8282 if (!t->gen) {
8283 if (t->p == t->pend) return -1;
8284 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8285 t->p += n;
8286 }
8287 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8288 t->p += n;
8289 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8290 t->p += n;
8291 if (t->p < t->pend) {
8292 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8293 t->p += n;
8294 if (t->now > c) {
8295 if (t->now < 0x80 && c < 0x80) {
8296 rb_raise(rb_eArgError,
8297 "invalid range \"%c-%c\" in string transliteration",
8298 t->now, c);
8299 }
8300 else {
8301 rb_raise(rb_eArgError, "invalid range in string transliteration");
8302 }
8303 continue; /* not reached */
8304 }
8305 else if (t->now < c) {
8306 t->gen = 1;
8307 t->max = c;
8308 }
8309 }
8310 }
8311 return t->now;
8312 }
8313 else {
8314 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8315 if (t->now == t->max) {
8316 t->gen = 0;
8317 goto nextpart;
8318 }
8319 }
8320 if (t->now < t->max) {
8321 return t->now;
8322 }
8323 else {
8324 t->gen = 0;
8325 return t->max;
8326 }
8327 }
8328 }
8329}
8330
8331static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8332
8333static VALUE
8334tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8335{
8336 const unsigned int errc = -1;
8337 unsigned int trans[256];
8338 rb_encoding *enc, *e1, *e2;
8339 struct tr trsrc, trrepl;
8340 int cflag = 0;
8341 unsigned int c, c0, last = 0;
8342 int modify = 0, i, l;
8343 unsigned char *s, *send;
8344 VALUE hash = 0;
8345 int singlebyte = single_byte_optimizable(str);
8346 int termlen;
8347 int cr;
8348
8349#define CHECK_IF_ASCII(c) \
8350 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8351 (cr = ENC_CODERANGE_VALID) : 0)
8352
8353 StringValue(src);
8354 StringValue(repl);
8355 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8356 if (RSTRING_LEN(repl) == 0) {
8357 return rb_str_delete_bang(1, &src, str);
8358 }
8359
8360 cr = ENC_CODERANGE(str);
8361 e1 = rb_enc_check(str, src);
8362 e2 = rb_enc_check(str, repl);
8363 if (e1 == e2) {
8364 enc = e1;
8365 }
8366 else {
8367 enc = rb_enc_check(src, repl);
8368 }
8369 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8370 if (RSTRING_LEN(src) > 1 &&
8371 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8372 trsrc.p + l < trsrc.pend) {
8373 cflag = 1;
8374 trsrc.p += l;
8375 }
8376 trrepl.p = RSTRING_PTR(repl);
8377 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8378 trsrc.gen = trrepl.gen = 0;
8379 trsrc.now = trrepl.now = 0;
8380 trsrc.max = trrepl.max = 0;
8381
8382 if (cflag) {
8383 for (i=0; i<256; i++) {
8384 trans[i] = 1;
8385 }
8386 while ((c = trnext(&trsrc, enc)) != errc) {
8387 if (c < 256) {
8388 trans[c] = errc;
8389 }
8390 else {
8391 if (!hash) hash = rb_hash_new();
8392 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8393 }
8394 }
8395 while ((c = trnext(&trrepl, enc)) != errc)
8396 /* retrieve last replacer */;
8397 last = trrepl.now;
8398 for (i=0; i<256; i++) {
8399 if (trans[i] != errc) {
8400 trans[i] = last;
8401 }
8402 }
8403 }
8404 else {
8405 unsigned int r;
8406
8407 for (i=0; i<256; i++) {
8408 trans[i] = errc;
8409 }
8410 while ((c = trnext(&trsrc, enc)) != errc) {
8411 r = trnext(&trrepl, enc);
8412 if (r == errc) r = trrepl.now;
8413 if (c < 256) {
8414 trans[c] = r;
8415 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8416 }
8417 else {
8418 if (!hash) hash = rb_hash_new();
8419 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8420 }
8421 }
8422 }
8423
8424 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8425 cr = ENC_CODERANGE_7BIT;
8426 str_modify_keep_cr(str);
8427 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8428 termlen = rb_enc_mbminlen(enc);
8429 if (sflag) {
8430 int clen, tlen;
8431 long offset, max = RSTRING_LEN(str);
8432 unsigned int save = -1;
8433 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8434
8435 while (s < send) {
8436 int may_modify = 0;
8437
8438 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8439 if (!MBCLEN_CHARFOUND_P(r)) {
8440 xfree(buf);
8441 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8442 }
8443 clen = MBCLEN_CHARFOUND_LEN(r);
8444 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8445
8446 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8447
8448 s += clen;
8449 if (c < 256) {
8450 c = trans[c];
8451 }
8452 else if (hash) {
8453 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8454 if (NIL_P(tmp)) {
8455 if (cflag) c = last;
8456 else c = errc;
8457 }
8458 else if (cflag) c = errc;
8459 else c = NUM2INT(tmp);
8460 }
8461 else {
8462 c = errc;
8463 }
8464 if (c != (unsigned int)-1) {
8465 if (save == c) {
8466 CHECK_IF_ASCII(c);
8467 continue;
8468 }
8469 save = c;
8470 tlen = rb_enc_codelen(c, enc);
8471 modify = 1;
8472 }
8473 else {
8474 save = -1;
8475 c = c0;
8476 if (enc != e1) may_modify = 1;
8477 }
8478 if ((offset = t - buf) + tlen > max) {
8479 size_t MAYBE_UNUSED(old) = max + termlen;
8480 max = offset + tlen + (send - s);
8481 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8482 t = buf + offset;
8483 }
8484 rb_enc_mbcput(c, t, enc);
8485 if (may_modify && memcmp(s, t, tlen) != 0) {
8486 modify = 1;
8487 }
8488 CHECK_IF_ASCII(c);
8489 t += tlen;
8490 }
8491 if (!STR_EMBED_P(str)) {
8492 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8493 }
8494 TERM_FILL((char *)t, termlen);
8495 RSTRING(str)->as.heap.ptr = (char *)buf;
8496 STR_SET_LEN(str, t - buf);
8497 STR_SET_NOEMBED(str);
8498 RSTRING(str)->as.heap.aux.capa = max;
8499 }
8500 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8501 while (s < send) {
8502 c = (unsigned char)*s;
8503 if (trans[c] != errc) {
8504 if (!cflag) {
8505 c = trans[c];
8506 *s = c;
8507 modify = 1;
8508 }
8509 else {
8510 *s = last;
8511 modify = 1;
8512 }
8513 }
8514 CHECK_IF_ASCII(c);
8515 s++;
8516 }
8517 }
8518 else {
8519 int clen, tlen;
8520 long offset, max = (long)((send - s) * 1.2);
8521 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8522
8523 while (s < send) {
8524 int may_modify = 0;
8525
8526 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8527 if (!MBCLEN_CHARFOUND_P(r)) {
8528 xfree(buf);
8529 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8530 }
8531 clen = MBCLEN_CHARFOUND_LEN(r);
8532 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8533
8534 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8535
8536 if (c < 256) {
8537 c = trans[c];
8538 }
8539 else if (hash) {
8540 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8541 if (NIL_P(tmp)) {
8542 if (cflag) c = last;
8543 else c = errc;
8544 }
8545 else if (cflag) c = errc;
8546 else c = NUM2INT(tmp);
8547 }
8548 else {
8549 c = cflag ? last : errc;
8550 }
8551 if (c != errc) {
8552 tlen = rb_enc_codelen(c, enc);
8553 modify = 1;
8554 }
8555 else {
8556 c = c0;
8557 if (enc != e1) may_modify = 1;
8558 }
8559 if ((offset = t - buf) + tlen > max) {
8560 size_t MAYBE_UNUSED(old) = max + termlen;
8561 max = offset + tlen + (long)((send - s) * 1.2);
8562 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8563 t = buf + offset;
8564 }
8565 if (s != t) {
8566 rb_enc_mbcput(c, t, enc);
8567 if (may_modify && memcmp(s, t, tlen) != 0) {
8568 modify = 1;
8569 }
8570 }
8571 CHECK_IF_ASCII(c);
8572 s += clen;
8573 t += tlen;
8574 }
8575 if (!STR_EMBED_P(str)) {
8576 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8577 }
8578 TERM_FILL((char *)t, termlen);
8579 RSTRING(str)->as.heap.ptr = (char *)buf;
8580 STR_SET_LEN(str, t - buf);
8581 STR_SET_NOEMBED(str);
8582 RSTRING(str)->as.heap.aux.capa = max;
8583 }
8584
8585 if (modify) {
8586 if (cr != ENC_CODERANGE_BROKEN)
8587 ENC_CODERANGE_SET(str, cr);
8588 rb_enc_associate(str, enc);
8589 return str;
8590 }
8591 return Qnil;
8592}
8593
8594
8595/*
8596 * call-seq:
8597 * tr!(selector, replacements) -> self or nil
8598 *
8599 * Like String#tr, but modifies +self+ in place.
8600 * Returns +self+ if any changes were made, +nil+ otherwise.
8601 *
8602 */
8603
8604static VALUE
8605rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8606{
8607 return tr_trans(str, src, repl, 0);
8608}
8609
8610
8611/*
8612 * call-seq:
8613 * tr(selector, replacements) -> new_string
8614 *
8615 * Returns a copy of +self+ with each character specified by string +selector+
8616 * translated to the corresponding character in string +replacements+.
8617 * The correspondence is _positional_:
8618 *
8619 * - Each occurrence of the first character specified by +selector+
8620 * is translated to the first character in +replacements+.
8621 * - Each occurrence of the second character specified by +selector+
8622 * is translated to the second character in +replacements+.
8623 * - And so on.
8624 *
8625 * Example:
8626 *
8627 * 'hello'.tr('el', 'ip') #=> "hippo"
8628 *
8629 * If +replacements+ is shorter than +selector+,
8630 * it is implicitly padded with its own last character:
8631 *
8632 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8633 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8634 *
8635 * Arguments +selector+ and +replacements+ must be valid character selectors
8636 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8637 * and may use any of its valid forms, including negation, ranges, and escaping:
8638 *
8639 * # Negation.
8640 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8641 * # Ranges.
8642 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8643 * # Escapes.
8644 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8645 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8646 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8647 *
8648 */
8649
8650static VALUE
8651rb_str_tr(VALUE str, VALUE src, VALUE repl)
8652{
8653 str = str_duplicate(rb_cString, str);
8654 tr_trans(str, src, repl, 0);
8655 return str;
8656}
8657
/* One table slot per possible byte value (0..UCHAR_MAX). */
#define TR_TABLE_MAX (UCHAR_MAX + 1)
/* The byte slots plus one extra flag slot stored at index TR_TABLE_MAX. */
#define TR_TABLE_SIZE (TR_TABLE_MAX + 1)
8660static void
8661tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8662 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8663{
8664 const unsigned int errc = -1;
8665 char buf[TR_TABLE_MAX];
8666 struct tr tr;
8667 unsigned int c;
8668 VALUE table = 0, ptable = 0;
8669 int i, l, cflag = 0;
8670
8671 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8672 tr.gen = tr.now = tr.max = 0;
8673
8674 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8675 cflag = 1;
8676 tr.p += l;
8677 }
8678 if (first) {
8679 for (i=0; i<TR_TABLE_MAX; i++) {
8680 stable[i] = 1;
8681 }
8682 stable[TR_TABLE_MAX] = cflag;
8683 }
8684 else if (stable[TR_TABLE_MAX] && !cflag) {
8685 stable[TR_TABLE_MAX] = 0;
8686 }
8687 for (i=0; i<TR_TABLE_MAX; i++) {
8688 buf[i] = cflag;
8689 }
8690
8691 while ((c = trnext(&tr, enc)) != errc) {
8692 if (c < TR_TABLE_MAX) {
8693 buf[(unsigned char)c] = !cflag;
8694 }
8695 else {
8696 VALUE key = UINT2NUM(c);
8697
8698 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8699 if (cflag) {
8700 ptable = *ctablep;
8701 table = ptable ? ptable : rb_hash_new();
8702 *ctablep = table;
8703 }
8704 else {
8705 table = rb_hash_new();
8706 ptable = *tablep;
8707 *tablep = table;
8708 }
8709 }
8710 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8711 rb_hash_aset(table, key, Qtrue);
8712 }
8713 }
8714 }
8715 for (i=0; i<TR_TABLE_MAX; i++) {
8716 stable[i] = stable[i] && buf[i];
8717 }
8718 if (!table && !cflag) {
8719 *tablep = 0;
8720 }
8721}
8722
8723
8724static int
8725tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8726{
8727 if (c < TR_TABLE_MAX) {
8728 return table[c] != 0;
8729 }
8730 else {
8731 VALUE v = UINT2NUM(c);
8732
8733 if (del) {
8734 if (!NIL_P(rb_hash_lookup(del, v)) &&
8735 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8736 return TRUE;
8737 }
8738 }
8739 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8740 return FALSE;
8741 }
8742 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8743 }
8744}
8745
8746/*
8747 * call-seq:
8748 * delete!(*selectors) -> self or nil
8749 *
8750 * Like String#delete, but modifies +self+ in place.
8751 * Returns +self+ if any changes were made, +nil+ otherwise.
8752 *
8753 */
8754
8755static VALUE
8756rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8757{
8758 char squeez[TR_TABLE_SIZE];
8759 rb_encoding *enc = 0;
8760 char *s, *send, *t;
8761 VALUE del = 0, nodel = 0;
8762 int modify = 0;
8763 int i, ascompat, cr;
8764
8765 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8767 for (i=0; i<argc; i++) {
8768 VALUE s = argv[i];
8769
8770 StringValue(s);
8771 enc = rb_enc_check(str, s);
8772 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8773 }
8774
8775 str_modify_keep_cr(str);
8776 ascompat = rb_enc_asciicompat(enc);
8777 s = t = RSTRING_PTR(str);
8778 send = RSTRING_END(str);
8779 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8780 while (s < send) {
8781 unsigned int c;
8782 int clen;
8783
8784 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8785 if (squeez[c]) {
8786 modify = 1;
8787 }
8788 else {
8789 if (t != s) *t = c;
8790 t++;
8791 }
8792 s++;
8793 }
8794 else {
8795 c = rb_enc_codepoint_len(s, send, &clen, enc);
8796
8797 if (tr_find(c, squeez, del, nodel)) {
8798 modify = 1;
8799 }
8800 else {
8801 if (t != s) rb_enc_mbcput(c, t, enc);
8802 t += clen;
8804 }
8805 s += clen;
8806 }
8807 }
8808 TERM_FILL(t, TERM_LEN(str));
8809 STR_SET_LEN(str, t - RSTRING_PTR(str));
8810 ENC_CODERANGE_SET(str, cr);
8811
8812 if (modify) return str;
8813 return Qnil;
8814}
8815
8816
8817/*
8818 * call-seq:
8819 * delete(*selectors) -> new_string
8820 *
8821 * Returns a copy of +self+ with characters specified by +selectors+ removed
8822 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8823 *
8824 * "hello".delete "l","lo" #=> "heo"
8825 * "hello".delete "lo" #=> "he"
8826 * "hello".delete "aeiou", "^e" #=> "hell"
8827 * "hello".delete "ej-m" #=> "ho"
8828 *
8829 */
8830
8831static VALUE
8832rb_str_delete(int argc, VALUE *argv, VALUE str)
8833{
8834 str = str_duplicate(rb_cString, str);
8835 rb_str_delete_bang(argc, argv, str);
8836 return str;
8837}
8838
8839
8840/*
8841 * call-seq:
8842 * squeeze!(*selectors) -> self or nil
8843 *
8844 * Like String#squeeze, but modifies +self+ in place.
8845 * Returns +self+ if any changes were made, +nil+ otherwise.
8846 */
8847
8848static VALUE
8849rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8850{
8851 char squeez[TR_TABLE_SIZE];
8852 rb_encoding *enc = 0;
8853 VALUE del = 0, nodel = 0;
8854 unsigned char *s, *send, *t;
8855 int i, modify = 0;
8856 int ascompat, singlebyte = single_byte_optimizable(str);
8857 unsigned int save;
8858
8859 if (argc == 0) {
8860 enc = STR_ENC_GET(str);
8861 }
8862 else {
8863 for (i=0; i<argc; i++) {
8864 VALUE s = argv[i];
8865
8866 StringValue(s);
8867 enc = rb_enc_check(str, s);
8868 if (singlebyte && !single_byte_optimizable(s))
8869 singlebyte = 0;
8870 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8871 }
8872 }
8873
8874 str_modify_keep_cr(str);
8875 s = t = (unsigned char *)RSTRING_PTR(str);
8876 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8877 send = (unsigned char *)RSTRING_END(str);
8878 save = -1;
8879 ascompat = rb_enc_asciicompat(enc);
8880
8881 if (singlebyte) {
8882 while (s < send) {
8883 unsigned int c = *s++;
8884 if (c != save || (argc > 0 && !squeez[c])) {
8885 *t++ = save = c;
8886 }
8887 }
8888 }
8889 else {
8890 while (s < send) {
8891 unsigned int c;
8892 int clen;
8893
8894 if (ascompat && (c = *s) < 0x80) {
8895 if (c != save || (argc > 0 && !squeez[c])) {
8896 *t++ = save = c;
8897 }
8898 s++;
8899 }
8900 else {
8901 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8902
8903 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8904 if (t != s) rb_enc_mbcput(c, t, enc);
8905 save = c;
8906 t += clen;
8907 }
8908 s += clen;
8909 }
8910 }
8911 }
8912
8913 TERM_FILL((char *)t, TERM_LEN(str));
8914 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8915 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8916 modify = 1;
8917 }
8918
8919 if (modify) return str;
8920 return Qnil;
8921}
8922
8923
8924/*
8925 * call-seq:
8926 * squeeze(*selectors) -> new_string
8927 *
8928 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8929 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8930 *
8931 * "Squeezed" means that each multiple-character run of a selected character
8932 * is squeezed down to a single character;
8933 * with no arguments given, squeezes all characters:
8934 *
8935 * "yellow moon".squeeze #=> "yelow mon"
8936 * "  now   is  the".squeeze(" ") #=> " now is the"
8937 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8938 *
8939 */
8940
8941static VALUE
8942rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8943{
8944 str = str_duplicate(rb_cString, str);
8945 rb_str_squeeze_bang(argc, argv, str);
8946 return str;
8947}
8948
8949
8950/*
8951 * call-seq:
8952 * tr_s!(selector, replacements) -> self or nil
8953 *
8954 * Like String#tr_s, but modifies +self+ in place.
8955 * Returns +self+ if any changes were made, +nil+ otherwise.
8956 *
8957 * Related: String#squeeze!.
8958 */
8959
8960static VALUE
8961rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8962{
8963 return tr_trans(str, src, repl, 1);
8964}
8965
8966
8967/*
8968 * call-seq:
8969 * tr_s(selector, replacements) -> string
8970 *
8971 * Like String#tr, but also squeezes the modified portions of the translated string;
8972 * returns a new string (translated and squeezed).
8973 *
8974 * 'hello'.tr_s('l', 'r') #=> "hero"
8975 * 'hello'.tr_s('el', '-') #=> "h-o"
8976 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8977 *
8978 * Related: String#squeeze.
8979 *
8980 */
8981
8982static VALUE
8983rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8984{
8985 str = str_duplicate(rb_cString, str);
8986 tr_trans(str, src, repl, 1);
8987 return str;
8988}
8989
8990
8991/*
8992 * call-seq:
8993 * count(*selectors) -> integer
8994 *
8995 * Returns the total number of characters in +self+
8996 * that are specified by the given +selectors+
8997 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8998 *
8999 * a = "hello world"
9000 * a.count "lo" #=> 5
9001 * a.count "lo", "o" #=> 2
9002 * a.count "hello", "^l" #=> 4
9003 * a.count "ej-m" #=> 4
9004 *
9005 * "hello^world".count "\\^aeiou" #=> 4
9006 * "hello-world".count "a\\-eo" #=> 4
9007 *
9008 * c = "hello world\\r\\n"
9009 * c.count "\\" #=> 2
9010 * c.count "\\A" #=> 0
9011 * c.count "X-\\w" #=> 3
9012 */
9013
9014static VALUE
9015rb_str_count(int argc, VALUE *argv, VALUE str)
9016{
9017 char table[TR_TABLE_SIZE];
9018 rb_encoding *enc = 0;
9019 VALUE del = 0, nodel = 0, tstr;
9020 char *s, *send;
9021 int i;
9022 int ascompat;
9023 size_t n = 0;
9024
9026
9027 tstr = argv[0];
9028 StringValue(tstr);
9029 enc = rb_enc_check(str, tstr);
9030 if (argc == 1) {
9031 const char *ptstr;
9032 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9033 (ptstr = RSTRING_PTR(tstr),
9034 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9035 !is_broken_string(str)) {
9036 int clen;
9037 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9038
9039 s = RSTRING_PTR(str);
9040 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9041 send = RSTRING_END(str);
9042 while (s < send) {
9043 if (*(unsigned char*)s++ == c) n++;
9044 }
9045 return SIZET2NUM(n);
9046 }
9047 }
9048
9049 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9050 for (i=1; i<argc; i++) {
9051 tstr = argv[i];
9052 StringValue(tstr);
9053 enc = rb_enc_check(str, tstr);
9054 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9055 }
9056
9057 s = RSTRING_PTR(str);
9058 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9059 send = RSTRING_END(str);
9060 ascompat = rb_enc_asciicompat(enc);
9061 while (s < send) {
9062 unsigned int c;
9063
9064 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9065 if (table[c]) {
9066 n++;
9067 }
9068 s++;
9069 }
9070 else {
9071 int clen;
9072 c = rb_enc_codepoint_len(s, send, &clen, enc);
9073 if (tr_find(c, table, del, nodel)) {
9074 n++;
9075 }
9076 s += clen;
9077 }
9078 }
9079
9080 return SIZET2NUM(n);
9081}
9082
9083static VALUE
9084rb_fs_check(VALUE val)
9085{
9086 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9087 val = rb_check_string_type(val);
9088 if (NIL_P(val)) return 0;
9089 }
9090 return val;
9091}
9092
/*
 * Byte-indexed whitespace table: non-zero for TAB, LF, VT, FF, CR and
 * SPACE, zero for every other byte value.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

/* Table lookup; the argument is reduced to an unsigned char first. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9113
9114static long
9115split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9116{
9117 if (empty_count >= 0 && len == 0) {
9118 return empty_count + 1;
9119 }
9120 if (empty_count > 0) {
9121 /* make different substrings */
9122 if (result) {
9123 do {
9124 rb_ary_push(result, str_new_empty_String(str));
9125 } while (--empty_count > 0);
9126 }
9127 else {
9128 do {
9129 rb_yield(str_new_empty_String(str));
9130 } while (--empty_count > 0);
9131 }
9132 }
9133 str = rb_str_subseq(str, beg, len);
9134 if (result) {
9135 rb_ary_push(result, str);
9136 }
9137 else {
9138 rb_yield(str);
9139 }
9140 return empty_count;
9141}
9142
/* Strategy selected by String#split for locating field boundaries. */
typedef enum {
    SPLIT_TYPE_AWK,     /* whitespace-separated fields */
    SPLIT_TYPE_STRING,  /* literal string separator */
    SPLIT_TYPE_REGEXP,  /* regular-expression separator */
    SPLIT_TYPE_CHARS    /* empty separator: one field per character */
} split_type_t;
9146
9147static split_type_t
9148literal_split_pattern(VALUE spat, split_type_t default_type)
9149{
9150 rb_encoding *enc = STR_ENC_GET(spat);
9151 const char *ptr;
9152 long len;
9153 RSTRING_GETMEM(spat, ptr, len);
9154 if (len == 0) {
9155 /* Special case - split into chars */
9156 return SPLIT_TYPE_CHARS;
9157 }
9158 else if (rb_enc_asciicompat(enc)) {
9159 if (len == 1 && ptr[0] == ' ') {
9160 return SPLIT_TYPE_AWK;
9161 }
9162 }
9163 else {
9164 int l;
9165 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9166 return SPLIT_TYPE_AWK;
9167 }
9168 }
9169 return default_type;
9170}
9171
9172/*
9173 * call-seq:
9174 * split(field_sep = $;, limit = 0) -> array
9175 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9176 *
9177 * :include: doc/string/split.rdoc
9178 *
9179 */
9180
9181static VALUE
9182rb_str_split_m(int argc, VALUE *argv, VALUE str)
9183{
9184 rb_encoding *enc;
9185 VALUE spat;
9186 VALUE limit;
9187 split_type_t split_type;
9188 long beg, end, i = 0, empty_count = -1;
9189 int lim = 0;
9190 VALUE result, tmp;
9191
9192 result = rb_block_given_p() ? Qfalse : Qnil;
9193 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9194 lim = NUM2INT(limit);
9195 if (lim <= 0) limit = Qnil;
9196 else if (lim == 1) {
9197 if (RSTRING_LEN(str) == 0)
9198 return result ? rb_ary_new2(0) : str;
9199 tmp = str_duplicate(rb_cString, str);
9200 if (!result) {
9201 rb_yield(tmp);
9202 return str;
9203 }
9204 return rb_ary_new3(1, tmp);
9205 }
9206 i = 1;
9207 }
9208 if (NIL_P(limit) && !lim) empty_count = 0;
9209
9210 enc = STR_ENC_GET(str);
9211 split_type = SPLIT_TYPE_REGEXP;
9212 if (!NIL_P(spat)) {
9213 spat = get_pat_quoted(spat, 0);
9214 }
9215 else if (NIL_P(spat = rb_fs)) {
9216 split_type = SPLIT_TYPE_AWK;
9217 }
9218 else if (!(spat = rb_fs_check(spat))) {
9219 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9220 }
9221 else {
9222 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9223 }
9224 if (split_type != SPLIT_TYPE_AWK) {
9225 switch (BUILTIN_TYPE(spat)) {
9226 case T_REGEXP:
9227 rb_reg_options(spat); /* check if uninitialized */
9228 tmp = RREGEXP_SRC(spat);
9229 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9230 if (split_type == SPLIT_TYPE_AWK) {
9231 spat = tmp;
9232 split_type = SPLIT_TYPE_STRING;
9233 }
9234 break;
9235
9236 case T_STRING:
9237 mustnot_broken(spat);
9238 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9239 break;
9240
9241 default:
9243 }
9244 }
9245
9246#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9247
9248 beg = 0;
9249 char *ptr = RSTRING_PTR(str);
9250 char *eptr = RSTRING_END(str);
9251 if (split_type == SPLIT_TYPE_AWK) {
9252 char *bptr = ptr;
9253 int skip = 1;
9254 unsigned int c;
9255
9256 if (result) result = rb_ary_new();
9257 end = beg;
9258 if (is_ascii_string(str)) {
9259 while (ptr < eptr) {
9260 c = (unsigned char)*ptr++;
9261 if (skip) {
9262 if (ascii_isspace(c)) {
9263 beg = ptr - bptr;
9264 }
9265 else {
9266 end = ptr - bptr;
9267 skip = 0;
9268 if (!NIL_P(limit) && lim <= i) break;
9269 }
9270 }
9271 else if (ascii_isspace(c)) {
9272 SPLIT_STR(beg, end-beg);
9273 skip = 1;
9274 beg = ptr - bptr;
9275 if (!NIL_P(limit)) ++i;
9276 }
9277 else {
9278 end = ptr - bptr;
9279 }
9280 }
9281 }
9282 else {
9283 while (ptr < eptr) {
9284 int n;
9285
9286 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9287 ptr += n;
9288 if (skip) {
9289 if (rb_isspace(c)) {
9290 beg = ptr - bptr;
9291 }
9292 else {
9293 end = ptr - bptr;
9294 skip = 0;
9295 if (!NIL_P(limit) && lim <= i) break;
9296 }
9297 }
9298 else if (rb_isspace(c)) {
9299 SPLIT_STR(beg, end-beg);
9300 skip = 1;
9301 beg = ptr - bptr;
9302 if (!NIL_P(limit)) ++i;
9303 }
9304 else {
9305 end = ptr - bptr;
9306 }
9307 }
9308 }
9309 }
9310 else if (split_type == SPLIT_TYPE_STRING) {
9311 char *str_start = ptr;
9312 char *substr_start = ptr;
9313 char *sptr = RSTRING_PTR(spat);
9314 long slen = RSTRING_LEN(spat);
9315
9316 if (result) result = rb_ary_new();
9317 mustnot_broken(str);
9318 enc = rb_enc_check(str, spat);
9319 while (ptr < eptr &&
9320 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9321 /* Check we are at the start of a char */
9322 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9323 if (t != ptr + end) {
9324 ptr = t;
9325 continue;
9326 }
9327 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9328 ptr += end + slen;
9329 substr_start = ptr;
9330 if (!NIL_P(limit) && lim <= ++i) break;
9331 }
9332 beg = ptr - str_start;
9333 }
9334 else if (split_type == SPLIT_TYPE_CHARS) {
9335 char *str_start = ptr;
9336 int n;
9337
9338 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9339 mustnot_broken(str);
9340 enc = rb_enc_get(str);
9341 while (ptr < eptr &&
9342 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9343 SPLIT_STR(ptr - str_start, n);
9344 ptr += n;
9345 if (!NIL_P(limit) && lim <= ++i) break;
9346 }
9347 beg = ptr - str_start;
9348 }
9349 else {
9350 if (result) result = rb_ary_new();
9351 long len = RSTRING_LEN(str);
9352 long start = beg;
9353 long idx;
9354 int last_null = 0;
9355 struct re_registers *regs;
9356 VALUE match = 0;
9357
9358 for (; rb_reg_search(spat, str, start, 0) >= 0;
9359 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9360 match = rb_backref_get();
9361 if (!result) rb_match_busy(match);
9362 regs = RMATCH_REGS(match);
9363 end = BEG(0);
9364 if (start == end && BEG(0) == END(0)) {
9365 if (!ptr) {
9366 SPLIT_STR(0, 0);
9367 break;
9368 }
9369 else if (last_null == 1) {
9370 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9371 beg = start;
9372 }
9373 else {
9374 if (start == len)
9375 start++;
9376 else
9377 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9378 last_null = 1;
9379 continue;
9380 }
9381 }
9382 else {
9383 SPLIT_STR(beg, end-beg);
9384 beg = start = END(0);
9385 }
9386 last_null = 0;
9387
9388 for (idx=1; idx < regs->num_regs; idx++) {
9389 if (BEG(idx) == -1) continue;
9390 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9391 }
9392 if (!NIL_P(limit) && lim <= ++i) break;
9393 }
9394 if (match) rb_match_unbusy(match);
9395 }
9396 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9397 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9398 }
9399
9400 return result ? result : str;
9401}
9402
9403VALUE
9404rb_str_split(VALUE str, const char *sep0)
9405{
9406 VALUE sep;
9407
9408 StringValue(str);
9409 sep = rb_str_new_cstr(sep0);
9410 return rb_str_split_m(1, &sep, str);
9411}
9412
/* Result array of capacity +size+ when no block is given, 0 otherwise;
 * the first argument is not used. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
9414
9415static inline int
9416enumerator_element(VALUE ary, VALUE e)
9417{
9418 if (ary) {
9419 rb_ary_push(ary, e);
9420 return 0;
9421 }
9422 else {
9423 rb_yield(e);
9424 return 1;
9425 }
9426}
9427
9428#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9429
9430static const char *
9431chomp_newline(const char *p, const char *e, rb_encoding *enc)
9432{
9433 const char *prev = rb_enc_prev_char(p, e, e, enc);
9434 if (rb_enc_is_newline(prev, e, enc)) {
9435 e = prev;
9436 prev = rb_enc_prev_char(p, e, e, enc);
9437 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9438 e = prev;
9439 }
9440 return e;
9441}
9442
9443static VALUE
9444get_rs(void)
9445{
9446 VALUE rs = rb_rs;
9447 if (!NIL_P(rs) &&
9448 (!RB_TYPE_P(rs, T_STRING) ||
9449 RSTRING_LEN(rs) != 1 ||
9450 RSTRING_PTR(rs)[0] != '\n')) {
9451 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9452 }
9453 return rs;
9454}
9455
9456#define rb_rs get_rs()
9457
9458static VALUE
9459rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9460{
9461 rb_encoding *enc;
9462 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9463 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9464 long pos, len, rslen;
9465 int rsnewline = 0;
9466
9467 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9468 rs = rb_rs;
9469 if (!NIL_P(opts)) {
9470 static ID keywords[1];
9471 if (!keywords[0]) {
9472 keywords[0] = rb_intern_const("chomp");
9473 }
9474 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9475 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9476 }
9477
9478 if (NIL_P(rs)) {
9479 if (!ENUM_ELEM(ary, str)) {
9480 return ary;
9481 }
9482 else {
9483 return orig;
9484 }
9485 }
9486
9487 if (!RSTRING_LEN(str)) goto end;
9488 str = rb_str_new_frozen(str);
9489 ptr = subptr = RSTRING_PTR(str);
9490 pend = RSTRING_END(str);
9491 len = RSTRING_LEN(str);
9492 StringValue(rs);
9493 rslen = RSTRING_LEN(rs);
9494
9495 if (rs == rb_default_rs)
9496 enc = rb_enc_get(str);
9497 else
9498 enc = rb_enc_check(str, rs);
9499
9500 if (rslen == 0) {
9501 /* paragraph mode */
9502 int n;
9503 const char *eol = NULL;
9504 subend = subptr;
9505 while (subend < pend) {
9506 long chomp_rslen = 0;
9507 do {
9508 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9509 n = 0;
9510 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9511 if (rb_enc_is_newline(subend + n, pend, enc)) {
9512 if (eol == subend) break;
9513 subend += rslen;
9514 if (subptr) {
9515 eol = subend;
9516 chomp_rslen = -rslen;
9517 }
9518 }
9519 else {
9520 if (!subptr) subptr = subend;
9521 subend += rslen;
9522 }
9523 rslen = 0;
9524 } while (subend < pend);
9525 if (!subptr) break;
9526 if (rslen == 0) chomp_rslen = 0;
9527 line = rb_str_subseq(str, subptr - ptr,
9528 subend - subptr + (chomp ? chomp_rslen : rslen));
9529 if (ENUM_ELEM(ary, line)) {
9530 str_mod_check(str, ptr, len);
9531 }
9532 subptr = eol = NULL;
9533 }
9534 goto end;
9535 }
9536 else {
9537 rsptr = RSTRING_PTR(rs);
9538 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9539 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9540 rsnewline = 1;
9541 }
9542 }
9543
9544 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9545 rs = rb_str_new(rsptr, rslen);
9546 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9547 rsptr = RSTRING_PTR(rs);
9548 rslen = RSTRING_LEN(rs);
9549 }
9550
9551 while (subptr < pend) {
9552 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9553 if (pos < 0) break;
9554 hit = subptr + pos;
9555 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9556 if (hit != adjusted) {
9557 subptr = adjusted;
9558 continue;
9559 }
9560 subend = hit += rslen;
9561 if (chomp) {
9562 if (rsnewline) {
9563 subend = chomp_newline(subptr, subend, enc);
9564 }
9565 else {
9566 subend -= rslen;
9567 }
9568 }
9569 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9570 if (ENUM_ELEM(ary, line)) {
9571 str_mod_check(str, ptr, len);
9572 }
9573 subptr = hit;
9574 }
9575
9576 if (subptr != pend) {
9577 if (chomp) {
9578 if (rsnewline) {
9579 pend = chomp_newline(subptr, pend, enc);
9580 }
9581 else if (pend - subptr >= rslen &&
9582 memcmp(pend - rslen, rsptr, rslen) == 0) {
9583 pend -= rslen;
9584 }
9585 }
9586 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9587 ENUM_ELEM(ary, line);
9588 RB_GC_GUARD(str);
9589 }
9590
9591 end:
9592 if (ary)
9593 return ary;
9594 else
9595 return orig;
9596}
9597
9598/*
9599 * call-seq:
9600 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9601 * each_line(line_sep = $/, chomp: false) -> enumerator
9602 *
9603 * :include: doc/string/each_line.rdoc
9604 *
9605 */
9606
9607static VALUE
9608rb_str_each_line(int argc, VALUE *argv, VALUE str)
9609{
9610 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9611 return rb_str_enumerate_lines(argc, argv, str, 0);
9612}
9613
9614/*
9615 * call-seq:
9616 * lines(line_sep = $/, chomp: false) -> array_of_strings
9617 *
9618 * Forms substrings ("lines") of +self+ according to the given arguments
9619 * (see String#each_line for details); returns the lines in an array.
9620 *
9621 */
9622
9623static VALUE
9624rb_str_lines(int argc, VALUE *argv, VALUE str)
9625{
9626 VALUE ary = WANTARRAY("lines", 0);
9627 return rb_str_enumerate_lines(argc, argv, str, ary);
9628}
9629
9630static VALUE
9631rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9632{
9633 return LONG2FIX(RSTRING_LEN(str));
9634}
9635
9636static VALUE
9637rb_str_enumerate_bytes(VALUE str, VALUE ary)
9638{
9639 long i;
9640
9641 for (i=0; i<RSTRING_LEN(str); i++) {
9642 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9643 }
9644 if (ary)
9645 return ary;
9646 else
9647 return str;
9648}
9649
9650/*
9651 * call-seq:
9652 * each_byte {|byte| ... } -> self
9653 * each_byte -> enumerator
9654 *
9655 * :include: doc/string/each_byte.rdoc
9656 *
9657 */
9658
9659static VALUE
9660rb_str_each_byte(VALUE str)
9661{
9662 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9663 return rb_str_enumerate_bytes(str, 0);
9664}
9665
9666/*
9667 * call-seq:
9668 * bytes -> array_of_bytes
9669 *
9670 * :include: doc/string/bytes.rdoc
9671 *
9672 */
9673
9674static VALUE
9675rb_str_bytes(VALUE str)
9676{
9677 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9678 return rb_str_enumerate_bytes(str, ary);
9679}
9680
9681static VALUE
9682rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9683{
9684 return rb_str_length(str);
9685}
9686
9687static VALUE
9688rb_str_enumerate_chars(VALUE str, VALUE ary)
9689{
9690 VALUE orig = str;
9691 long i, len, n;
9692 const char *ptr;
9693 rb_encoding *enc;
9694
9695 str = rb_str_new_frozen(str);
9696 ptr = RSTRING_PTR(str);
9697 len = RSTRING_LEN(str);
9698 enc = rb_enc_get(str);
9699
9701 for (i = 0; i < len; i += n) {
9702 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9703 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9704 }
9705 }
9706 else {
9707 for (i = 0; i < len; i += n) {
9708 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9709 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9710 }
9711 }
9712 RB_GC_GUARD(str);
9713 if (ary)
9714 return ary;
9715 else
9716 return orig;
9717}
9718
9719/*
9720 * call-seq:
9721 * each_char {|c| ... } -> self
9722 * each_char -> enumerator
9723 *
9724 * :include: doc/string/each_char.rdoc
9725 *
9726 */
9727
9728static VALUE
9729rb_str_each_char(VALUE str)
9730{
9731 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9732 return rb_str_enumerate_chars(str, 0);
9733}
9734
9735/*
9736 * call-seq:
9737 * chars -> array_of_characters
9738 *
9739 * :include: doc/string/chars.rdoc
9740 *
9741 */
9742
9743static VALUE
9744rb_str_chars(VALUE str)
9745{
9746 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9747 return rb_str_enumerate_chars(str, ary);
9748}
9749
9750static VALUE
9751rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9752{
9753 VALUE orig = str;
9754 int n;
9755 unsigned int c;
9756 const char *ptr, *end;
9757 rb_encoding *enc;
9758
9759 if (single_byte_optimizable(str))
9760 return rb_str_enumerate_bytes(str, ary);
9761
9762 str = rb_str_new_frozen(str);
9763 ptr = RSTRING_PTR(str);
9764 end = RSTRING_END(str);
9765 enc = STR_ENC_GET(str);
9766
9767 while (ptr < end) {
9768 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9769 ENUM_ELEM(ary, UINT2NUM(c));
9770 ptr += n;
9771 }
9772 RB_GC_GUARD(str);
9773 if (ary)
9774 return ary;
9775 else
9776 return orig;
9777}
9778
9779/*
9780 * call-seq:
9781 * each_codepoint {|integer| ... } -> self
9782 * each_codepoint -> enumerator
9783 *
9784 * :include: doc/string/each_codepoint.rdoc
9785 *
9786 */
9787
9788static VALUE
9789rb_str_each_codepoint(VALUE str)
9790{
9791 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9792 return rb_str_enumerate_codepoints(str, 0);
9793}
9794
9795/*
9796 * call-seq:
9797 * codepoints -> array_of_integers
9798 *
9799 * :include: doc/string/codepoints.rdoc
9800 *
9801 */
9802
9803static VALUE
9804rb_str_codepoints(VALUE str)
9805{
9806 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9807 return rb_str_enumerate_codepoints(str, ary);
9808}
9809
/*
 * Compiles the pattern "\X" for encoding +enc+ and returns the new regex.
 *
 * The pattern source is the two ASCII bytes '\\' 'X' by default.  For the
 * UTF-16BE/LE and UTF-32BE/LE encoding indexes the same two characters are
 * spelled out as 2- or 4-byte units in the matching byte order instead.
 *
 * A compile failure is reported through rb_fatal(); otherwise the regex
 * produced by onig_new() is returned.  Callers in this file release it with
 * onig_free() unless it is the instance kept by
 * get_cached_reg_grapheme_cluster().
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    size_t source_len = sizeof(source_ascii) - 1; /* exclude the trailing NUL */

    switch (encidx) {
        /* CHARS_* expand one character into its bytes for the named width and byte order */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
        /* CASE_UTF emits one case label that switches source/source_len to a static wide pattern */
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* non-zero result: turn the error code into text and abort */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9851
9852static regex_t *
9853get_cached_reg_grapheme_cluster(rb_encoding *enc)
9854{
9855 int encidx = rb_enc_to_index(enc);
9856 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9857
9858 if (encidx == rb_utf8_encindex()) {
9859 if (!reg_grapheme_cluster_utf8) {
9860 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9861 }
9862
9863 return reg_grapheme_cluster_utf8;
9864 }
9865
9866 return NULL;
9867}
9868
9869static VALUE
9870rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9871{
9872 size_t grapheme_cluster_count = 0;
9873 rb_encoding *enc = get_encoding(str);
9874 const char *ptr, *end;
9875
9876 if (!rb_enc_unicode_p(enc)) {
9877 return rb_str_length(str);
9878 }
9879
9880 bool cached_reg_grapheme_cluster = true;
9881 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9882 if (!reg_grapheme_cluster) {
9883 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9884 cached_reg_grapheme_cluster = false;
9885 }
9886
9887 ptr = RSTRING_PTR(str);
9888 end = RSTRING_END(str);
9889
9890 while (ptr < end) {
9891 OnigPosition len = onig_match(reg_grapheme_cluster,
9892 (const OnigUChar *)ptr, (const OnigUChar *)end,
9893 (const OnigUChar *)ptr, NULL, 0);
9894 if (len <= 0) break;
9895 grapheme_cluster_count++;
9896 ptr += len;
9897 }
9898
9899 if (!cached_reg_grapheme_cluster) {
9900 onig_free(reg_grapheme_cluster);
9901 }
9902
9903 return SIZET2NUM(grapheme_cluster_count);
9904}
9905
9906static VALUE
9907rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9908{
9909 VALUE orig = str;
9910 rb_encoding *enc = get_encoding(str);
9911 const char *ptr0, *ptr, *end;
9912
9913 if (!rb_enc_unicode_p(enc)) {
9914 return rb_str_enumerate_chars(str, ary);
9915 }
9916
9917 if (!ary) str = rb_str_new_frozen(str);
9918
9919 bool cached_reg_grapheme_cluster = true;
9920 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9921 if (!reg_grapheme_cluster) {
9922 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9923 cached_reg_grapheme_cluster = false;
9924 }
9925
9926 ptr0 = ptr = RSTRING_PTR(str);
9927 end = RSTRING_END(str);
9928
9929 while (ptr < end) {
9930 OnigPosition len = onig_match(reg_grapheme_cluster,
9931 (const OnigUChar *)ptr, (const OnigUChar *)end,
9932 (const OnigUChar *)ptr, NULL, 0);
9933 if (len <= 0) break;
9934 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9935 ptr += len;
9936 }
9937
9938 if (!cached_reg_grapheme_cluster) {
9939 onig_free(reg_grapheme_cluster);
9940 }
9941
9942 RB_GC_GUARD(str);
9943 if (ary)
9944 return ary;
9945 else
9946 return orig;
9947}
9948
9949/*
9950 * call-seq:
9951 * each_grapheme_cluster {|gc| ... } -> self
9952 * each_grapheme_cluster -> enumerator
9953 *
9954 * :include: doc/string/each_grapheme_cluster.rdoc
9955 *
9956 */
9957
9958static VALUE
9959rb_str_each_grapheme_cluster(VALUE str)
9960{
9961 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9962 return rb_str_enumerate_grapheme_clusters(str, 0);
9963}
9964
9965/*
9966 * call-seq:
9967 * grapheme_clusters -> array_of_grapheme_clusters
9968 *
9969 * :include: doc/string/grapheme_clusters.rdoc
9970 *
9971 */
9972
9973static VALUE
9974rb_str_grapheme_clusters(VALUE str)
9975{
9976 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9977 return rb_str_enumerate_grapheme_clusters(str, ary);
9978}
9979
9980static long
9981chopped_length(VALUE str)
9982{
9983 rb_encoding *enc = STR_ENC_GET(str);
9984 const char *p, *p2, *beg, *end;
9985
9986 beg = RSTRING_PTR(str);
9987 end = beg + RSTRING_LEN(str);
9988 if (beg >= end) return 0;
9989 p = rb_enc_prev_char(beg, end, end, enc);
9990 if (!p) return 0;
9991 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9992 p2 = rb_enc_prev_char(beg, p, end, enc);
9993 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9994 }
9995 return p - beg;
9996}
9997
9998/*
9999 * call-seq:
10000 * chop! -> self or nil
10001 *
10002 * Like String#chop, but modifies +self+ in place;
10003 * returns +nil+ if +self+ is empty, +self+ otherwise.
10004 *
10005 * Related: String#chomp!.
10006 */
10007
10008static VALUE
10009rb_str_chop_bang(VALUE str)
10010{
10011 str_modify_keep_cr(str);
10012 if (RSTRING_LEN(str) > 0) {
10013 long len;
10014 len = chopped_length(str);
10015 STR_SET_LEN(str, len);
10016 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10017 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10019 }
10020 return str;
10021 }
10022 return Qnil;
10023}
10024
10025
10026/*
10027 * call-seq:
10028 * chop -> new_string
10029 *
10030 * :include: doc/string/chop.rdoc
10031 *
10032 */
10033
10034static VALUE
10035rb_str_chop(VALUE str)
10036{
10037 return rb_str_subseq(str, 0, chopped_length(str));
10038}
10039
10040static long
10041smart_chomp(VALUE str, const char *e, const char *p)
10042{
10043 rb_encoding *enc = rb_enc_get(str);
10044 if (rb_enc_mbminlen(enc) > 1) {
10045 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10046 if (rb_enc_is_newline(pp, e, enc)) {
10047 e = pp;
10048 }
10049 pp = e - rb_enc_mbminlen(enc);
10050 if (pp >= p) {
10051 pp = rb_enc_left_char_head(p, pp, e, enc);
10052 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10053 e = pp;
10054 }
10055 }
10056 }
10057 else {
10058 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10059 case '\n':
10060 if (--e > p && *(e-1) == '\r') {
10061 --e;
10062 }
10063 break;
10064 case '\r':
10065 --e;
10066 break;
10067 }
10068 }
10069 return e - p;
10070}
10071
/*
 * Returns the byte length +str+ would have after removing a trailing record
 * separator +rs+.  Neither string is modified here.
 *
 * - empty +str+: 0.
 * - +rs+ is rb_default_rs: delegated to smart_chomp().
 * - +rs+ is empty: every trailing newline (each optionally preceded by
 *   "\r") is removed.
 * - +rs+ is longer than +str+: nothing is removed.
 * - +rs+ is a single newline character in its own encoding: delegated to
 *   smart_chomp().
 * - otherwise: +rs+ is removed only when +str+ ends with exactly those bytes
 *   and the match starts on a character boundary.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip all trailing newlines */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    if (rslen > len) return len;

    /* from here on enc is the separator's encoding */
    enc = rb_enc_get(rs);
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* separator is exactly one minimum-width unit long */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* the result of rb_enc_check() on both strings is used for the boundary test below */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    /* compare the last byte first, then the whole separator */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
10143
10149static VALUE
10150chomp_rs(int argc, const VALUE *argv)
10151{
10152 rb_check_arity(argc, 0, 1);
10153 if (argc > 0) {
10154 VALUE rs = argv[0];
10155 if (!NIL_P(rs)) StringValue(rs);
10156 return rs;
10157 }
10158 else {
10159 return rb_rs;
10160 }
10161}
10162
10163VALUE
10164rb_str_chomp_string(VALUE str, VALUE rs)
10165{
10166 long olen = RSTRING_LEN(str);
10167 long len = chompped_length(str, rs);
10168 if (len >= olen) return Qnil;
10169 str_modify_keep_cr(str);
10170 STR_SET_LEN(str, len);
10171 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10172 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10174 }
10175 return str;
10176}
10177
10178/*
10179 * call-seq:
10180 * chomp!(line_sep = $/) -> self or nil
10181 *
10182 * Like String#chomp, but modifies +self+ in place;
10183 * returns +nil+ if no modification made, +self+ otherwise.
10184 *
10185 */
10186
10187static VALUE
10188rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10189{
10190 VALUE rs;
10191 str_modifiable(str);
10192 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10193 rs = chomp_rs(argc, argv);
10194 if (NIL_P(rs)) return Qnil;
10195 return rb_str_chomp_string(str, rs);
10196}
10197
10198
10199/*
10200 * call-seq:
10201 * chomp(line_sep = $/) -> new_string
10202 *
10203 * :include: doc/string/chomp.rdoc
10204 *
10205 */
10206
10207static VALUE
10208rb_str_chomp(int argc, VALUE *argv, VALUE str)
10209{
10210 VALUE rs = chomp_rs(argc, argv);
10211 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10212 return rb_str_subseq(str, 0, chompped_length(str, rs));
10213}
10214
10215static long
10216lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10217{
10218 const char *const start = s;
10219
10220 if (!s || s >= e) return 0;
10221
10222 /* remove spaces at head */
10223 if (single_byte_optimizable(str)) {
10224 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10225 }
10226 else {
10227 while (s < e) {
10228 int n;
10229 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10230
10231 if (cc && !rb_isspace(cc)) break;
10232 s += n;
10233 }
10234 }
10235 return s - start;
10236}
10237
/*
 *  call-seq:
 *    lstrip! -> self or nil
 *
 *  Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#rstrip!, String#strip!.
 */
10247
10248static VALUE
10249rb_str_lstrip_bang(VALUE str)
10250{
10251 rb_encoding *enc;
10252 char *start, *s;
10253 long olen, loffset;
10254
10255 str_modify_keep_cr(str);
10256 enc = STR_ENC_GET(str);
10257 RSTRING_GETMEM(str, start, olen);
10258 loffset = lstrip_offset(str, start, start+olen, enc);
10259 if (loffset > 0) {
10260 long len = olen-loffset;
10261 s = start + loffset;
10262 memmove(start, s, len);
10263 STR_SET_LEN(str, len);
10264 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10265 return str;
10266 }
10267 return Qnil;
10268}
10269
10270
10271/*
10272 * call-seq:
10273 * lstrip -> new_string
10274 *
10275 * Returns a copy of +self+ with leading whitespace removed;
10276 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10277 *
10278 * whitespace = "\x00\t\n\v\f\r "
10279 * s = whitespace + 'abc' + whitespace
10280 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10281 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10282 *
10283 * Related: String#rstrip, String#strip.
10284 */
10285
10286static VALUE
10287rb_str_lstrip(VALUE str)
10288{
10289 char *start;
10290 long len, loffset;
10291 RSTRING_GETMEM(str, start, len);
10292 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10293 if (loffset <= 0) return str_duplicate(rb_cString, str);
10294 return rb_str_subseq(str, loffset, len - loffset);
10295}
10296
10297static long
10298rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10299{
10300 const char *t;
10301
10302 rb_str_check_dummy_enc(enc);
10304 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10305 }
10306 if (!s || s >= e) return 0;
10307 t = e;
10308
10309 /* remove trailing spaces or '\0's */
10310 if (single_byte_optimizable(str)) {
10311 unsigned char c;
10312 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10313 }
10314 else {
10315 char *tp;
10316
10317 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10318 unsigned int c = rb_enc_codepoint(tp, e, enc);
10319 if (c && !rb_isspace(c)) break;
10320 t = tp;
10321 }
10322 }
10323 return e - t;
10324}
10325
/*
 *  call-seq:
 *    rstrip! -> self or nil
 *
 *  Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#strip!.
 */
10335
10336static VALUE
10337rb_str_rstrip_bang(VALUE str)
10338{
10339 rb_encoding *enc;
10340 char *start;
10341 long olen, roffset;
10342
10343 str_modify_keep_cr(str);
10344 enc = STR_ENC_GET(str);
10345 RSTRING_GETMEM(str, start, olen);
10346 roffset = rstrip_offset(str, start, start+olen, enc);
10347 if (roffset > 0) {
10348 long len = olen - roffset;
10349
10350 STR_SET_LEN(str, len);
10351 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10352 return str;
10353 }
10354 return Qnil;
10355}
10356
10357
10358/*
10359 * call-seq:
10360 * rstrip -> new_string
10361 *
10362 * Returns a copy of the receiver with trailing whitespace removed;
10363 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10364 *
10365 * whitespace = "\x00\t\n\v\f\r "
10366 * s = whitespace + 'abc' + whitespace
10367 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10368 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10369 *
10370 * Related: String#lstrip, String#strip.
10371 */
10372
10373static VALUE
10374rb_str_rstrip(VALUE str)
10375{
10376 rb_encoding *enc;
10377 char *start;
10378 long olen, roffset;
10379
10380 enc = STR_ENC_GET(str);
10381 RSTRING_GETMEM(str, start, olen);
10382 roffset = rstrip_offset(str, start, start+olen, enc);
10383
10384 if (roffset <= 0) return str_duplicate(rb_cString, str);
10385 return rb_str_subseq(str, 0, olen-roffset);
10386}
10387
10388
/*
 *  call-seq:
 *    strip! -> self or nil
 *
 *  Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
 */
10398
10399static VALUE
10400rb_str_strip_bang(VALUE str)
10401{
10402 char *start;
10403 long olen, loffset, roffset;
10404 rb_encoding *enc;
10405
10406 str_modify_keep_cr(str);
10407 enc = STR_ENC_GET(str);
10408 RSTRING_GETMEM(str, start, olen);
10409 loffset = lstrip_offset(str, start, start+olen, enc);
10410 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10411
10412 if (loffset > 0 || roffset > 0) {
10413 long len = olen-roffset;
10414 if (loffset > 0) {
10415 len -= loffset;
10416 memmove(start, start + loffset, len);
10417 }
10418 STR_SET_LEN(str, len);
10419 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10420 return str;
10421 }
10422 return Qnil;
10423}
10424
10425
10426/*
10427 * call-seq:
10428 * strip -> new_string
10429 *
10430 * Returns a copy of the receiver with leading and trailing whitespace removed;
10431 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10432 *
10433 * whitespace = "\x00\t\n\v\f\r "
10434 * s = whitespace + 'abc' + whitespace
10435 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10436 * s.strip # => "abc"
10437 *
10438 * Related: String#lstrip, String#rstrip.
10439 */
10440
10441static VALUE
10442rb_str_strip(VALUE str)
10443{
10444 char *start;
10445 long olen, loffset, roffset;
10446 rb_encoding *enc = STR_ENC_GET(str);
10447
10448 RSTRING_GETMEM(str, start, olen);
10449 loffset = lstrip_offset(str, start, start+olen, enc);
10450 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10451
10452 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10453 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10454}
10455
10456static VALUE
10457scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10458{
10459 VALUE result = Qnil;
10460 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10461 if (pos >= 0) {
10462 VALUE match;
10463 struct re_registers *regs;
10464 if (BUILTIN_TYPE(pat) == T_STRING) {
10465 regs = NULL;
10466 end = pos + RSTRING_LEN(pat);
10467 }
10468 else {
10469 match = rb_backref_get();
10470 regs = RMATCH_REGS(match);
10471 pos = BEG(0);
10472 end = END(0);
10473 }
10474
10475 if (pos == end) {
10476 rb_encoding *enc = STR_ENC_GET(str);
10477 /*
10478 * Always consume at least one character of the input string
10479 */
10480 if (RSTRING_LEN(str) > end)
10481 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10482 RSTRING_END(str), enc);
10483 else
10484 *start = end + 1;
10485 }
10486 else {
10487 *start = end;
10488 }
10489
10490 if (!regs || regs->num_regs == 1) {
10491 result = rb_str_subseq(str, pos, end - pos);
10492 return result;
10493 }
10494 else {
10495 result = rb_ary_new2(regs->num_regs);
10496 for (int i = 1; i < regs->num_regs; i++) {
10497 VALUE s = Qnil;
10498 if (BEG(i) >= 0) {
10499 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10500 }
10501
10502 rb_ary_push(result, s);
10503 }
10504 }
10505
10506 RB_GC_GUARD(match);
10507 }
10508
10509 return result;
10510}
10511
10512
10513/*
10514 * call-seq:
10515 * scan(string_or_regexp) -> array
10516 * scan(string_or_regexp) {|matches| ... } -> self
10517 *
10518 * Matches a pattern against +self+; the pattern is:
10519 *
10520 * - +string_or_regexp+ itself, if it is a Regexp.
10521 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10522 *
10523 * Iterates through +self+, generating a collection of matching results:
10524 *
10525 * - If the pattern contains no groups, each result is the
10526 * matched string, <code>$&</code>.
10527 * - If the pattern contains groups, each result is an array
10528 * containing one entry per group.
10529 *
10530 * With no block given, returns an array of the results:
10531 *
10532 * s = 'cruel world'
10533 * s.scan(/\w+/) # => ["cruel", "world"]
10534 * s.scan(/.../) # => ["cru", "el ", "wor"]
10535 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10536 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10537 *
10538 * With a block given, calls the block with each result; returns +self+:
10539 *
10540 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10541 * print "\n"
10542 * s.scan(/(.)(.)/) {|x,y| print y, x }
10543 * print "\n"
10544 *
10545 * Output:
10546 *
10547 * <<cruel>> <<world>>
10548 * rceu lowlr
10549 *
10550 */
10551
10552static VALUE
10553rb_str_scan(VALUE str, VALUE pat)
10554{
10555 VALUE result;
10556 long start = 0;
10557 long last = -1, prev = 0;
10558 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10559
10560 pat = get_pat_quoted(pat, 1);
10561 mustnot_broken(str);
10562 if (!rb_block_given_p()) {
10563 VALUE ary = rb_ary_new();
10564
10565 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10566 last = prev;
10567 prev = start;
10568 rb_ary_push(ary, result);
10569 }
10570 if (last >= 0) rb_pat_search(pat, str, last, 1);
10571 else rb_backref_set(Qnil);
10572 return ary;
10573 }
10574
10575 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10576 last = prev;
10577 prev = start;
10578 rb_yield(result);
10579 str_mod_check(str, p, len);
10580 }
10581 if (last >= 0) rb_pat_search(pat, str, last, 1);
10582 return str;
10583}
10584
10585
10586/*
10587 * call-seq:
10588 * hex -> integer
10589 *
10590 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10591 * (with an optional sign and an optional <code>0x</code>) and returns the
10592 * corresponding number;
10593 * returns zero if there is no such leading substring:
10594 *
10595 * '0x0a'.hex # => 10
10596 * '-1234'.hex # => -4660
10597 * '0'.hex # => 0
10598 * 'non-numeric'.hex # => 0
10599 *
10600 * Related: String#oct.
10601 *
10602 */
10603
10604static VALUE
10605rb_str_hex(VALUE str)
10606{
10607 return rb_str_to_inum(str, 16, FALSE);
10608}
10609
10610
10611/*
10612 * call-seq:
10613 * oct -> integer
10614 *
10615 * Interprets the leading substring of +self+ as a string of octal digits
10616 * (with an optional sign) and returns the corresponding number;
10617 * returns zero if there is no such leading substring:
10618 *
10619 * '123'.oct # => 83
10620 * '-377'.oct # => -255
10621 * '0377non-numeric'.oct # => 255
10622 * 'non-numeric'.oct # => 0
10623 *
10624 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10625 * see Kernel#Integer.
10626 *
10627 * Related: String#hex.
10628 *
10629 */
10630
10631static VALUE
10632rb_str_oct(VALUE str)
10633{
10634 return rb_str_to_inum(str, -8, FALSE);
10635}
10636
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/*
 * Lock held by rb_str_crypt() around its call to crypt() when crypt_r() is
 * not available.  It is initialized statically.
 */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};

/*
 * Called by rb_str_crypt() before taking the lock.  The body is empty:
 * the lock above already has a static initializer.
 */
static void
crypt_mutex_initialize(void)
{
}
#endif
10650
10651/*
10652 * call-seq:
10653 * crypt(salt_str) -> new_string
10654 *
10655 * Returns the string generated by calling <code>crypt(3)</code>
10656 * standard library function with <code>str</code> and
10657 * <code>salt_str</code>, in this order, as its arguments. Please do
10658 * not use this method any longer. It is legacy; provided only for
10659 * backward compatibility with ruby scripts in earlier days. It is
10660 * bad to use in contemporary programs for several reasons:
10661 *
10662 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10663 * run. The generated string lacks data portability.
10664 *
10665 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10666 * (i.e. silently ends up in unexpected results).
10667 *
10668 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10669 * thread safe.
10670 *
10671 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10672 * very very weak. According to its manpage, Linux's traditional
10673 * <code>crypt(3)</code> output has only 2**56 variations; too
10674 * easy to brute force today. And this is the default behaviour.
10675 *
10676 * * In order to make things robust some OSes implement so-called
10677 * "modular" usage. To go through, you have to do a complex
10678 * build-up of the <code>salt_str</code> parameter, by hand.
10679 * Failure in generation of a proper salt string tends not to
10680 * yield any errors; typos in parameters are normally not
10681 * detectable.
10682 *
10683 * * For instance, in the following example, the second invocation
10684 * of String#crypt is wrong; it has a typo in "round=" (lacks
10685 * "s"). However the call does not fail and something unexpected
10686 * is generated.
10687 *
10688 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10689 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10690 *
10691 * * Even in the "modular" mode, some hash functions are considered
10692 * archaic and no longer recommended at all; for instance module
10693 * <code>$1$</code> is officially abandoned by its author: see
10694 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10695 * instance module <code>$3$</code> is considered completely
10696 * broken: see the manpage of FreeBSD.
10697 *
10698 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10699 * written above, <code>crypt(3)</code> on Mac OS never fails.
10700 * This means even if you build up a proper salt string it
10701 * generates a traditional DES hash anyways, and there is no way
10702 * for you to be aware of.
10703 *
10704 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10705 *
10706 * If for some reason you cannot migrate to other secure contemporary
10707 * password hashing algorithms, install the string-crypt gem and
10708 * <code>require 'string/crypt'</code> to continue using it.
10709 */
10710
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /*
     * Two build variants:
     * - HAVE_CRYPT_R: crypt_r() with a per-call struct crypt_data obtained
     *   through ALLOCV; CRYPT_END() releases that buffer.
     * - otherwise: crypt() under crypt_mutex; CRYPT_END() releases the lock.
     */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* the first two salt bytes must exist and be non-NUL */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* if either of the first two salt bytes is non-ASCII, use a 2-byte copy with the high bits cleared */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    crypt_mutex_initialize();
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* save errno before CRYPT_END(), which may overwrite it */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
    /* copy the result into a Ruby string before CRYPT_END() runs */
    result = rb_str_new_cstr(res);
    CRYPT_END();
    return result;
}
10766
10767
10768/*
10769 * call-seq:
10770 * ord -> integer
10771 *
10772 * :include: doc/string/ord.rdoc
10773 *
10774 */
10775
10776static VALUE
10777rb_str_ord(VALUE s)
10778{
10779 unsigned int c;
10780
10781 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10782 return UINT2NUM(c);
10783}
10784/*
10785 * call-seq:
10786 * sum(n = 16) -> integer
10787 *
10788 * :include: doc/string/sum.rdoc
10789 *
10790 */
10791
10792static VALUE
10793rb_str_sum(int argc, VALUE *argv, VALUE str)
10794{
10795 int bits = 16;
10796 char *ptr, *p, *pend;
10797 long len;
10798 VALUE sum = INT2FIX(0);
10799 unsigned long sum0 = 0;
10800
10801 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10802 bits = 0;
10803 }
10804 ptr = p = RSTRING_PTR(str);
10805 len = RSTRING_LEN(str);
10806 pend = p + len;
10807
10808 while (p < pend) {
10809 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10810 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10811 str_mod_check(str, ptr, len);
10812 sum0 = 0;
10813 }
10814 sum0 += (unsigned char)*p;
10815 p++;
10816 }
10817
10818 if (bits == 0) {
10819 if (sum0) {
10820 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10821 }
10822 }
10823 else {
10824 if (sum == INT2FIX(0)) {
10825 if (bits < (int)sizeof(long)*CHAR_BIT) {
10826 sum0 &= (((unsigned long)1)<<bits)-1;
10827 }
10828 sum = LONG2FIX(sum0);
10829 }
10830 else {
10831 VALUE mod;
10832
10833 if (sum0) {
10834 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10835 }
10836
10837 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10838 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10839 sum = rb_funcall(sum, '&', 1, mod);
10840 }
10841 }
10842 return sum;
10843}
10844
/*
 * Shared implementation of ljust, rjust and center.
 *
 * argv holds the target width and an optional pad string (default " ").
 * +jflag+ selects the side: 'l' puts all padding on the right, 'r' puts it
 * all on the left, anything else splits it with the smaller half on the
 * left.
 *
 * Returns a duplicate of +str+ when the width is negative or not greater
 * than the current character length; otherwise a new padded string.
 * Raises rb_eArgError for an empty pad string ("zero width padding") or
 * when the resulting size would overflow ("argument too big").
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    /* flen: pad length in bytes; fclen: pad length in characters */
    long width, len, flen = 1, fclen = 1;
    VALUE res;
    char *p;
    const char *f = " ";
    /* llen/rlen: pad characters on each side; llen2/rlen2: bytes of the
     * trailing partial pad repetition on each side */
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    VALUE pad;
    int singlebyte = 1, cr;
    int termlen;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    termlen = rb_enc_mbminlen(enc);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc); /* rb_enc_check */
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc); /* rb_enc_check */
    if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
    n = width - len;
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* byte offsets of the partial repetitions within the pad string */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* build the padding byte count step by step, checking each step against LONG_MAX */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = str_enc_new(rb_cString, 0, len, enc);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* original contents */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    TERM_FILL(p, termlen);
    STR_SET_LEN(res, p-RSTRING_PTR(res));

    /* combine the code ranges of both inputs; leave the result unset if broken */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);

    RB_GC_GUARD(pad);
    return res;
}
10936
10937
10938/*
10939 * call-seq:
10940 * ljust(size, pad_string = ' ') -> new_string
10941 *
10942 * :include: doc/string/ljust.rdoc
10943 *
10944 * Related: String#rjust, String#center.
10945 *
10946 */
10947
10948static VALUE
10949rb_str_ljust(int argc, VALUE *argv, VALUE str)
10950{
10951 return rb_str_justify(argc, argv, str, 'l');
10952}
10953
10954/*
10955 * call-seq:
10956 * rjust(size, pad_string = ' ') -> new_string
10957 *
10958 * :include: doc/string/rjust.rdoc
10959 *
10960 * Related: String#ljust, String#center.
10961 *
10962 */
10963
10964static VALUE
10965rb_str_rjust(int argc, VALUE *argv, VALUE str)
10966{
10967 return rb_str_justify(argc, argv, str, 'r');
10968}
10969
10970
10971/*
10972 * call-seq:
10973 * center(size, pad_string = ' ') -> new_string
10974 *
10975 * :include: doc/string/center.rdoc
10976 *
10977 * Related: String#ljust, String#rjust.
10978 *
10979 */
10980
10981static VALUE
10982rb_str_center(int argc, VALUE *argv, VALUE str)
10983{
10984 return rb_str_justify(argc, argv, str, 'c');
10985}
10986
10987/*
10988 * call-seq:
10989 * partition(string_or_regexp) -> [head, match, tail]
10990 *
10991 * :include: doc/string/partition.rdoc
10992 *
10993 */
10994
10995static VALUE
10996rb_str_partition(VALUE str, VALUE sep)
10997{
10998 long pos;
10999
11000 sep = get_pat_quoted(sep, 0);
11001 if (RB_TYPE_P(sep, T_REGEXP)) {
11002 if (rb_reg_search(sep, str, 0, 0) < 0) {
11003 goto failed;
11004 }
11005 VALUE match = rb_backref_get();
11006 struct re_registers *regs = RMATCH_REGS(match);
11007
11008 pos = BEG(0);
11009 sep = rb_str_subseq(str, pos, END(0) - pos);
11010 }
11011 else {
11012 pos = rb_str_index(str, sep, 0);
11013 if (pos < 0) goto failed;
11014 }
11015 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11016 sep,
11017 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11018 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11019
11020 failed:
11021 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11022}
11023
11024/*
11025 * call-seq:
11026 * rpartition(sep) -> [head, match, tail]
11027 *
11028 * :include: doc/string/rpartition.rdoc
11029 *
11030 */
11031
11032static VALUE
11033rb_str_rpartition(VALUE str, VALUE sep)
11034{
11035 long pos = RSTRING_LEN(str);
11036
11037 sep = get_pat_quoted(sep, 0);
11038 if (RB_TYPE_P(sep, T_REGEXP)) {
11039 if (rb_reg_search(sep, str, pos, 1) < 0) {
11040 goto failed;
11041 }
11042 VALUE match = rb_backref_get();
11043 struct re_registers *regs = RMATCH_REGS(match);
11044
11045 pos = BEG(0);
11046 sep = rb_str_subseq(str, pos, END(0) - pos);
11047 }
11048 else {
11049 pos = rb_str_sublen(str, pos);
11050 pos = rb_str_rindex(str, sep, pos);
11051 if (pos < 0) {
11052 goto failed;
11053 }
11054 }
11055
11056 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11057 sep,
11058 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11059 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11060 failed:
11061 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11062}
11063
11064/*
11065 * call-seq:
11066 * start_with?(*string_or_regexp) -> true or false
11067 *
11068 * :include: doc/string/start_with_p.rdoc
11069 *
11070 */
11071
11072static VALUE
11073rb_str_start_with(int argc, VALUE *argv, VALUE str)
11074{
11075 int i;
11076
11077 for (i=0; i<argc; i++) {
11078 VALUE tmp = argv[i];
11079 if (RB_TYPE_P(tmp, T_REGEXP)) {
11080 if (rb_reg_start_with_p(tmp, str))
11081 return Qtrue;
11082 }
11083 else {
11084 const char *p, *s, *e;
11085 long slen, tlen;
11086 rb_encoding *enc;
11087
11088 StringValue(tmp);
11089 enc = rb_enc_check(str, tmp);
11090 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11091 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11092 p = RSTRING_PTR(str);
11093 e = p + slen;
11094 s = p + tlen;
11095 if (!at_char_right_boundary(p, s, e, enc))
11096 continue;
11097 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11098 return Qtrue;
11099 }
11100 }
11101 return Qfalse;
11102}
11103
11104/*
11105 * call-seq:
11106 * end_with?(*strings) -> true or false
11107 *
11108 * :include: doc/string/end_with_p.rdoc
11109 *
11110 */
11111
11112static VALUE
11113rb_str_end_with(int argc, VALUE *argv, VALUE str)
11114{
11115 int i;
11116
11117 for (i=0; i<argc; i++) {
11118 VALUE tmp = argv[i];
11119 const char *p, *s, *e;
11120 long slen, tlen;
11121 rb_encoding *enc;
11122
11123 StringValue(tmp);
11124 enc = rb_enc_check(str, tmp);
11125 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11126 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11127 p = RSTRING_PTR(str);
11128 e = p + slen;
11129 s = e - tlen;
11130 if (!at_char_boundary(p, s, e, enc))
11131 continue;
11132 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11133 return Qtrue;
11134 }
11135 return Qfalse;
11136}
11137
11147static long
11148deleted_prefix_length(VALUE str, VALUE prefix)
11149{
11150 const char *strptr, *prefixptr;
11151 long olen, prefixlen;
11152 rb_encoding *enc = rb_enc_get(str);
11153
11154 StringValue(prefix);
11155
11156 if (!is_broken_string(prefix) ||
11157 !rb_enc_asciicompat(enc) ||
11158 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11159 enc = rb_enc_check(str, prefix);
11160 }
11161
11162 /* return 0 if not start with prefix */
11163 prefixlen = RSTRING_LEN(prefix);
11164 if (prefixlen <= 0) return 0;
11165 olen = RSTRING_LEN(str);
11166 if (olen < prefixlen) return 0;
11167 strptr = RSTRING_PTR(str);
11168 prefixptr = RSTRING_PTR(prefix);
11169 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11170 if (is_broken_string(prefix)) {
11171 if (!is_broken_string(str)) {
11172 /* prefix in a valid string cannot be broken */
11173 return 0;
11174 }
11175 const char *strend = strptr + olen;
11176 const char *after_prefix = strptr + prefixlen;
11177 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11178 /* prefix does not end at char-boundary */
11179 return 0;
11180 }
11181 }
11182 /* prefix part in `str` also should be valid. */
11183
11184 return prefixlen;
11185}
11186
11187/*
11188 * call-seq:
11189 * delete_prefix!(prefix) -> self or nil
11190 *
11191 * Like String#delete_prefix, except that +self+ is modified in place.
11192 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11193 *
11194 */
11195
11196static VALUE
11197rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11198{
11199 long prefixlen;
11200 str_modify_keep_cr(str);
11201
11202 prefixlen = deleted_prefix_length(str, prefix);
11203 if (prefixlen <= 0) return Qnil;
11204
11205 return rb_str_drop_bytes(str, prefixlen);
11206}
11207
11208/*
11209 * call-seq:
11210 * delete_prefix(prefix) -> new_string
11211 *
11212 * :include: doc/string/delete_prefix.rdoc
11213 *
11214 */
11215
11216static VALUE
11217rb_str_delete_prefix(VALUE str, VALUE prefix)
11218{
11219 long prefixlen;
11220
11221 prefixlen = deleted_prefix_length(str, prefix);
11222 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11223
11224 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11225}
11226
11236static long
11237deleted_suffix_length(VALUE str, VALUE suffix)
11238{
11239 const char *strptr, *suffixptr;
11240 long olen, suffixlen;
11241 rb_encoding *enc;
11242
11243 StringValue(suffix);
11244 if (is_broken_string(suffix)) return 0;
11245 enc = rb_enc_check(str, suffix);
11246
11247 /* return 0 if not start with suffix */
11248 suffixlen = RSTRING_LEN(suffix);
11249 if (suffixlen <= 0) return 0;
11250 olen = RSTRING_LEN(str);
11251 if (olen < suffixlen) return 0;
11252 strptr = RSTRING_PTR(str);
11253 suffixptr = RSTRING_PTR(suffix);
11254 const char *strend = strptr + olen;
11255 const char *before_suffix = strend - suffixlen;
11256 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11257 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11258
11259 return suffixlen;
11260}
11261
11262/*
11263 * call-seq:
11264 * delete_suffix!(suffix) -> self or nil
11265 *
11266 * Like String#delete_suffix, except that +self+ is modified in place.
11267 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11268 *
11269 */
11270
11271static VALUE
11272rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11273{
11274 long olen, suffixlen, len;
11275 str_modifiable(str);
11276
11277 suffixlen = deleted_suffix_length(str, suffix);
11278 if (suffixlen <= 0) return Qnil;
11279
11280 olen = RSTRING_LEN(str);
11281 str_modify_keep_cr(str);
11282 len = olen - suffixlen;
11283 STR_SET_LEN(str, len);
11284 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11285 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11287 }
11288 return str;
11289}
11290
11291/*
11292 * call-seq:
11293 * delete_suffix(suffix) -> new_string
11294 *
11295 * :include: doc/string/delete_suffix.rdoc
11296 *
11297 */
11298
11299static VALUE
11300rb_str_delete_suffix(VALUE str, VALUE suffix)
11301{
11302 long suffixlen;
11303
11304 suffixlen = deleted_suffix_length(str, suffix);
11305 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11306
11307 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11308}
11309
11310void
11311rb_str_setter(VALUE val, ID id, VALUE *var)
11312{
11313 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11314 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11315 }
11316 *var = val;
11317}
11318
11319static void
11320rb_fs_setter(VALUE val, ID id, VALUE *var)
11321{
11322 val = rb_fs_check(val);
11323 if (!val) {
11324 rb_raise(rb_eTypeError,
11325 "value of %"PRIsVALUE" must be String or Regexp",
11326 rb_id2str(id));
11327 }
11328 if (!NIL_P(val)) {
11329 rb_warn_deprecated("'$;'", NULL);
11330 }
11331 *var = val;
11332}
11333
11334
11335/*
11336 * call-seq:
11337 * force_encoding(encoding) -> self
11338 *
11339 * :include: doc/string/force_encoding.rdoc
11340 *
11341 */
11342
11343static VALUE
11344rb_str_force_encoding(VALUE str, VALUE enc)
11345{
11346 str_modifiable(str);
11347
11348 rb_encoding *encoding = rb_to_encoding(enc);
11349 int idx = rb_enc_to_index(encoding);
11350
11351 // If the encoding is unchanged, we do nothing.
11352 if (ENCODING_GET(str) == idx) {
11353 return str;
11354 }
11355
11356 rb_enc_associate_index(str, idx);
11357
11358 // If the coderange was 7bit and the new encoding is ASCII-compatible
11359 // we can keep the coderange.
11360 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11361 return str;
11362 }
11363
11365 return str;
11366}
11367
11368/*
11369 * call-seq:
11370 * b -> string
11371 *
11372 * :include: doc/string/b.rdoc
11373 *
11374 */
11375
11376static VALUE
11377rb_str_b(VALUE str)
11378{
11379 VALUE str2;
11380 if (STR_EMBED_P(str)) {
11381 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11382 }
11383 else {
11384 str2 = str_alloc_heap(rb_cString);
11385 }
11386 str_replace_shared_without_enc(str2, str);
11387
11388 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11389 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11390 // If we know the receiver's code range then we know the result's code range.
11391 int cr = ENC_CODERANGE(str);
11392 switch (cr) {
11393 case ENC_CODERANGE_7BIT:
11395 break;
11399 break;
11400 default:
11401 ENC_CODERANGE_CLEAR(str2);
11402 break;
11403 }
11404 }
11405
11406 return str2;
11407}
11408
11409/*
11410 * call-seq:
11411 * valid_encoding? -> true or false
11412 *
11413 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11414 *
11415 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11416 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11417 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11418 */
11419
11420static VALUE
11421rb_str_valid_encoding_p(VALUE str)
11422{
11423 int cr = rb_enc_str_coderange(str);
11424
11425 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11426}
11427
11428/*
11429 * call-seq:
11430 * ascii_only? -> true or false
11431 *
11432 * Returns +true+ if +self+ contains only ASCII characters,
11433 * +false+ otherwise:
11434 *
11435 * 'abc'.ascii_only? # => true
11436 * "abc\u{6666}".ascii_only? # => false
11437 *
11438 */
11439
11440static VALUE
11441rb_str_is_ascii_only_p(VALUE str)
11442{
11443 int cr = rb_enc_str_coderange(str);
11444
11445 return RBOOL(cr == ENC_CODERANGE_7BIT);
11446}
11447
11448VALUE
11450{
11451 static const char ellipsis[] = "...";
11452 const long ellipsislen = sizeof(ellipsis) - 1;
11453 rb_encoding *const enc = rb_enc_get(str);
11454 const long blen = RSTRING_LEN(str);
11455 const char *const p = RSTRING_PTR(str), *e = p + blen;
11456 VALUE estr, ret = 0;
11457
11458 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11459 if (len * rb_enc_mbminlen(enc) >= blen ||
11460 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11461 ret = str;
11462 }
11463 else if (len <= ellipsislen ||
11464 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11465 if (rb_enc_asciicompat(enc)) {
11466 ret = rb_str_new(ellipsis, len);
11467 rb_enc_associate(ret, enc);
11468 }
11469 else {
11470 estr = rb_usascii_str_new(ellipsis, len);
11471 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11472 }
11473 }
11474 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11475 rb_str_cat(ret, ellipsis, ellipsislen);
11476 }
11477 else {
11478 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11479 rb_enc_from_encoding(enc), 0, Qnil);
11480 rb_str_append(ret, estr);
11481 }
11482 return ret;
11483}
11484
11485static VALUE
11486str_compat_and_valid(VALUE str, rb_encoding *enc)
11487{
11488 int cr;
11489 str = StringValue(str);
11490 cr = rb_enc_str_coderange(str);
11491 if (cr == ENC_CODERANGE_BROKEN) {
11492 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11493 }
11494 else {
11495 rb_encoding *e = STR_ENC_GET(str);
11496 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11497 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11498 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11499 }
11500 }
11501 return str;
11502}
11503
11504static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11505
11506VALUE
11508{
11509 rb_encoding *enc = STR_ENC_GET(str);
11510 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11511}
11512
11513VALUE
11514rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11515{
11516 int cr = ENC_CODERANGE_UNKNOWN;
11517 if (enc == STR_ENC_GET(str)) {
11518 /* cached coderange makes sense only when enc equals the
11519 * actual encoding of str */
11520 cr = ENC_CODERANGE(str);
11521 }
11522 return enc_str_scrub(enc, str, repl, cr);
11523}
11524
11525static VALUE
11526enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11527{
11528 int encidx;
11529 VALUE buf = Qnil;
11530 const char *rep, *p, *e, *p1, *sp;
11531 long replen = -1;
11532 long slen;
11533
11534 if (rb_block_given_p()) {
11535 if (!NIL_P(repl))
11536 rb_raise(rb_eArgError, "both of block and replacement given");
11537 replen = 0;
11538 }
11539
11540 if (ENC_CODERANGE_CLEAN_P(cr))
11541 return Qnil;
11542
11543 if (!NIL_P(repl)) {
11544 repl = str_compat_and_valid(repl, enc);
11545 }
11546
11547 if (rb_enc_dummy_p(enc)) {
11548 return Qnil;
11549 }
11550 encidx = rb_enc_to_index(enc);
11551
11552#define DEFAULT_REPLACE_CHAR(str) do { \
11553 static const char replace[sizeof(str)-1] = str; \
11554 rep = replace; replen = (int)sizeof(replace); \
11555 } while (0)
11556
11557 slen = RSTRING_LEN(str);
11558 p = RSTRING_PTR(str);
11559 e = RSTRING_END(str);
11560 p1 = p;
11561 sp = p;
11562
11563 if (rb_enc_asciicompat(enc)) {
11564 int rep7bit_p;
11565 if (!replen) {
11566 rep = NULL;
11567 rep7bit_p = FALSE;
11568 }
11569 else if (!NIL_P(repl)) {
11570 rep = RSTRING_PTR(repl);
11571 replen = RSTRING_LEN(repl);
11572 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11573 }
11574 else if (encidx == rb_utf8_encindex()) {
11575 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11576 rep7bit_p = FALSE;
11577 }
11578 else {
11579 DEFAULT_REPLACE_CHAR("?");
11580 rep7bit_p = TRUE;
11581 }
11582 cr = ENC_CODERANGE_7BIT;
11583
11584 p = search_nonascii(p, e);
11585 if (!p) {
11586 p = e;
11587 }
11588 while (p < e) {
11589 int ret = rb_enc_precise_mbclen(p, e, enc);
11590 if (MBCLEN_NEEDMORE_P(ret)) {
11591 break;
11592 }
11593 else if (MBCLEN_CHARFOUND_P(ret)) {
11595 p += MBCLEN_CHARFOUND_LEN(ret);
11596 }
11597 else if (MBCLEN_INVALID_P(ret)) {
11598 /*
11599 * p1~p: valid ascii/multibyte chars
11600 * p ~e: invalid bytes + unknown bytes
11601 */
11602 long clen = rb_enc_mbmaxlen(enc);
11603 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11604 if (p > p1) {
11605 rb_str_buf_cat(buf, p1, p - p1);
11606 }
11607
11608 if (e - p < clen) clen = e - p;
11609 if (clen <= 2) {
11610 clen = 1;
11611 }
11612 else {
11613 const char *q = p;
11614 clen--;
11615 for (; clen > 1; clen--) {
11616 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11617 if (MBCLEN_NEEDMORE_P(ret)) break;
11618 if (MBCLEN_INVALID_P(ret)) continue;
11620 }
11621 }
11622 if (rep) {
11623 rb_str_buf_cat(buf, rep, replen);
11624 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11625 }
11626 else {
11627 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11628 str_mod_check(str, sp, slen);
11629 repl = str_compat_and_valid(repl, enc);
11630 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11633 }
11634 p += clen;
11635 p1 = p;
11636 p = search_nonascii(p, e);
11637 if (!p) {
11638 p = e;
11639 break;
11640 }
11641 }
11642 else {
11644 }
11645 }
11646 if (NIL_P(buf)) {
11647 if (p == e) {
11648 ENC_CODERANGE_SET(str, cr);
11649 return Qnil;
11650 }
11651 buf = rb_str_buf_new(RSTRING_LEN(str));
11652 }
11653 if (p1 < p) {
11654 rb_str_buf_cat(buf, p1, p - p1);
11655 }
11656 if (p < e) {
11657 if (rep) {
11658 rb_str_buf_cat(buf, rep, replen);
11659 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11660 }
11661 else {
11662 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11663 str_mod_check(str, sp, slen);
11664 repl = str_compat_and_valid(repl, enc);
11665 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11668 }
11669 }
11670 }
11671 else {
11672 /* ASCII incompatible */
11673 long mbminlen = rb_enc_mbminlen(enc);
11674 if (!replen) {
11675 rep = NULL;
11676 }
11677 else if (!NIL_P(repl)) {
11678 rep = RSTRING_PTR(repl);
11679 replen = RSTRING_LEN(repl);
11680 }
11681 else if (encidx == ENCINDEX_UTF_16BE) {
11682 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11683 }
11684 else if (encidx == ENCINDEX_UTF_16LE) {
11685 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11686 }
11687 else if (encidx == ENCINDEX_UTF_32BE) {
11688 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11689 }
11690 else if (encidx == ENCINDEX_UTF_32LE) {
11691 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11692 }
11693 else {
11694 DEFAULT_REPLACE_CHAR("?");
11695 }
11696
11697 while (p < e) {
11698 int ret = rb_enc_precise_mbclen(p, e, enc);
11699 if (MBCLEN_NEEDMORE_P(ret)) {
11700 break;
11701 }
11702 else if (MBCLEN_CHARFOUND_P(ret)) {
11703 p += MBCLEN_CHARFOUND_LEN(ret);
11704 }
11705 else if (MBCLEN_INVALID_P(ret)) {
11706 const char *q = p;
11707 long clen = rb_enc_mbmaxlen(enc);
11708 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11709 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11710
11711 if (e - p < clen) clen = e - p;
11712 if (clen <= mbminlen * 2) {
11713 clen = mbminlen;
11714 }
11715 else {
11716 clen -= mbminlen;
11717 for (; clen > mbminlen; clen-=mbminlen) {
11718 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11719 if (MBCLEN_NEEDMORE_P(ret)) break;
11720 if (MBCLEN_INVALID_P(ret)) continue;
11722 }
11723 }
11724 if (rep) {
11725 rb_str_buf_cat(buf, rep, replen);
11726 }
11727 else {
11728 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11729 str_mod_check(str, sp, slen);
11730 repl = str_compat_and_valid(repl, enc);
11731 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11732 }
11733 p += clen;
11734 p1 = p;
11735 }
11736 else {
11738 }
11739 }
11740 if (NIL_P(buf)) {
11741 if (p == e) {
11743 return Qnil;
11744 }
11745 buf = rb_str_buf_new(RSTRING_LEN(str));
11746 }
11747 if (p1 < p) {
11748 rb_str_buf_cat(buf, p1, p - p1);
11749 }
11750 if (p < e) {
11751 if (rep) {
11752 rb_str_buf_cat(buf, rep, replen);
11753 }
11754 else {
11755 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11756 str_mod_check(str, sp, slen);
11757 repl = str_compat_and_valid(repl, enc);
11758 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11759 }
11760 }
11762 }
11763 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11764 return buf;
11765}
11766
11767/*
11768 * call-seq:
11769 * scrub(replacement_string = default_replacement) -> new_string
11770 * scrub{|bytes| ... } -> new_string
11771 *
11772 * :include: doc/string/scrub.rdoc
11773 *
11774 */
11775static VALUE
11776str_scrub(int argc, VALUE *argv, VALUE str)
11777{
11778 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11779 VALUE new = rb_str_scrub(str, repl);
11780 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11781}
11782
11783/*
11784 * call-seq:
11785 * scrub! -> self
11786 * scrub!(replacement_string = default_replacement) -> self
11787 * scrub!{|bytes| ... } -> self
11788 *
11789 * Like String#scrub, except that any replacements are made in +self+.
11790 *
11791 */
11792static VALUE
11793str_scrub_bang(int argc, VALUE *argv, VALUE str)
11794{
11795 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11796 VALUE new = rb_str_scrub(str, repl);
11797 if (!NIL_P(new)) rb_str_replace(str, new);
11798 return str;
11799}
11800
11801static ID id_normalize;
11802static ID id_normalized_p;
11803static VALUE mUnicodeNormalize;
11804
11805static VALUE
11806unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11807{
11808 static int UnicodeNormalizeRequired = 0;
11809 VALUE argv2[2];
11810
11811 if (!UnicodeNormalizeRequired) {
11812 rb_require("unicode_normalize/normalize.rb");
11813 UnicodeNormalizeRequired = 1;
11814 }
11815 argv2[0] = str;
11816 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11817 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11818}
11819
11820/*
11821 * call-seq:
11822 * unicode_normalize(form = :nfc) -> string
11823 *
11824 * Returns a copy of +self+ with
11825 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11826 *
11827 * Argument +form+ must be one of the following symbols
11828 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11829 *
11830 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11831 * - +:nfd+: Canonical decomposition.
11832 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11833 * - +:nfkd+: Compatibility decomposition.
11834 *
11835 * The encoding of +self+ must be one of:
11836 *
11837 * - Encoding::UTF_8
11838 * - Encoding::UTF_16BE
11839 * - Encoding::UTF_16LE
11840 * - Encoding::UTF_32BE
11841 * - Encoding::UTF_32LE
11842 * - Encoding::GB18030
11843 * - Encoding::UCS_2BE
11844 * - Encoding::UCS_4BE
11845 *
11846 * Examples:
11847 *
11848 * "a\u0300".unicode_normalize # => "\u00E0"
11849 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11850 *
11851 * Related: String#unicode_normalize!, String#unicode_normalized?.
11852 */
11853static VALUE
11854rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11855{
11856 return unicode_normalize_common(argc, argv, str, id_normalize);
11857}
11858
11859/*
11860 * call-seq:
11861 * unicode_normalize!(form = :nfc) -> self
11862 *
11863 * Like String#unicode_normalize, except that the normalization
11864 * is performed on +self+.
11865 *
11866 * Related: String#unicode_normalized?.
11867 *
11868 */
11869static VALUE
11870rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11871{
11872 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11873}
11874
11875/* call-seq:
11876 * unicode_normalized?(form = :nfc) -> true or false
11877 *
11878 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11879 * +false+ otherwise.
11880 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11881 *
11882 * Examples:
11883 *
11884 * "a\u0300".unicode_normalized? # => false
11885 * "a\u0300".unicode_normalized?(:nfd) # => true
11886 * "\u00E0".unicode_normalized? # => true
11887 * "\u00E0".unicode_normalized?(:nfd) # => false
11888 *
11889 *
11890 * Raises an exception if +self+ is not in a Unicode encoding:
11891 *
11892 * s = "\xE0".force_encoding('ISO-8859-1')
11893 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11894 *
11895 * Related: String#unicode_normalize, String#unicode_normalize!.
11896 *
11897 */
11898static VALUE
11899rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11900{
11901 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11902}
11903
11904/**********************************************************************
11905 * Document-class: Symbol
11906 *
11907 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11908 *
11909 * You can create a +Symbol+ object explicitly with:
11910 *
11911 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11912 *
11913 * The same +Symbol+ object will be
11914 * created for a given name or string for the duration of a program's
11915 * execution, regardless of the context or meaning of that name. Thus
11916 * if <code>Fred</code> is a constant in one context, a method in
11917 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11918 * will be the same object in all three contexts.
11919 *
11920 * module One
11921 * class Fred
11922 * end
11923 * $f1 = :Fred
11924 * end
11925 * module Two
11926 * Fred = 1
11927 * $f2 = :Fred
11928 * end
11929 * def Fred()
11930 * end
11931 * $f3 = :Fred
11932 * $f1.object_id #=> 2514190
11933 * $f2.object_id #=> 2514190
11934 * $f3.object_id #=> 2514190
11935 *
11936 * Constant, method, and variable names are returned as symbols:
11937 *
11938 * module One
11939 * Two = 2
11940 * def three; 3 end
11941 * @four = 4
11942 * @@five = 5
11943 * $six = 6
11944 * end
11945 * seven = 7
11946 *
11947 * One.constants
11948 * # => [:Two]
11949 * One.instance_methods(true)
11950 * # => [:three]
11951 * One.instance_variables
11952 * # => [:@four]
11953 * One.class_variables
11954 * # => [:@@five]
11955 * global_variables.grep(/six/)
11956 * # => [:$six]
11957 * local_variables
11958 * # => [:seven]
11959 *
11960 * A +Symbol+ object differs from a String object in that
11961 * a +Symbol+ object represents an identifier, while a String object
11962 * represents text or data.
11963 *
11964 * == What's Here
11965 *
11966 * First, what's elsewhere. \Class +Symbol+:
11967 *
11968 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11969 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11970 *
11971 * Here, class +Symbol+ provides methods that are useful for:
11972 *
11973 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11974 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11975 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11976 *
11977 * === Methods for Querying
11978 *
11979 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11980 * - #=~: Returns the index of the first substring in symbol that matches a
11981 * given Regexp or other object; returns +nil+ if no match is found.
11982 * - #[], #slice : Returns a substring of symbol
11983 * determined by a given index, start/length, or range, or string.
11984 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11985 * - #encoding: Returns the Encoding object that represents the encoding
11986 * of symbol.
11987 * - #end_with?: Returns +true+ if symbol ends with
11988 * any of the given strings.
11989 * - #match: Returns a MatchData object if symbol
11990 * matches a given Regexp; +nil+ otherwise.
11991 * - #match?: Returns +true+ if symbol
11992 * matches a given Regexp; +false+ otherwise.
11993 * - #length, #size: Returns the number of characters in symbol.
11994 * - #start_with?: Returns +true+ if symbol starts with
11995 * any of the given strings.
11996 *
11997 * === Methods for Comparing
11998 *
11999 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12000 * or larger than symbol.
12001 * - #==, #===: Returns +true+ if a given symbol has the same content and
12002 * encoding.
12003 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12004 * symbol is smaller than, equal to, or larger than symbol.
12005 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12006 * after Unicode case folding; +false+ otherwise.
12007 *
12008 * === Methods for Converting
12009 *
12010 * - #capitalize: Returns symbol with the first character upcased
12011 * and all other characters downcased.
12012 * - #downcase: Returns symbol with all characters downcased.
12013 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12014 * - #name: Returns the frozen string corresponding to symbol.
12015 * - #succ, #next: Returns the symbol that is the successor to symbol.
12016 * - #swapcase: Returns symbol with all upcase characters downcased
12017 * and all downcase characters upcased.
12018 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12019 * - #to_s, #id2name: Returns the string corresponding to +self+.
12020 * - #to_sym, #intern: Returns +self+.
12021 * - #upcase: Returns symbol with all characters upcased.
12022 *
12023 */
12024
12025
12026/*
12027 * call-seq:
12028 * symbol == object -> true or false
12029 *
12030 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12031 */
12032
12033#define sym_equal rb_obj_equal
12034
12035static int
12036sym_printable(const char *s, const char *send, rb_encoding *enc)
12037{
12038 while (s < send) {
12039 int n;
12040 int c = rb_enc_precise_mbclen(s, send, enc);
12041
12042 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12043 n = MBCLEN_CHARFOUND_LEN(c);
12044 c = rb_enc_mbc_to_codepoint(s, send, enc);
12045 if (!rb_enc_isprint(c, enc)) return FALSE;
12046 s += n;
12047 }
12048 return TRUE;
12049}
12050
12051int
12052rb_str_symname_p(VALUE sym)
12053{
12054 rb_encoding *enc;
12055 const char *ptr;
12056 long len;
12057 rb_encoding *resenc = rb_default_internal_encoding();
12058
12059 if (resenc == NULL) resenc = rb_default_external_encoding();
12060 enc = STR_ENC_GET(sym);
12061 ptr = RSTRING_PTR(sym);
12062 len = RSTRING_LEN(sym);
12063 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12064 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12065 return FALSE;
12066 }
12067 return TRUE;
12068}
12069
12070VALUE
12071rb_str_quote_unprintable(VALUE str)
12072{
12073 rb_encoding *enc;
12074 const char *ptr;
12075 long len;
12076 rb_encoding *resenc;
12077
12078 Check_Type(str, T_STRING);
12079 resenc = rb_default_internal_encoding();
12080 if (resenc == NULL) resenc = rb_default_external_encoding();
12081 enc = STR_ENC_GET(str);
12082 ptr = RSTRING_PTR(str);
12083 len = RSTRING_LEN(str);
12084 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12085 !sym_printable(ptr, ptr + len, enc)) {
12086 return rb_str_escape(str);
12087 }
12088 return str;
12089}
12090
12091VALUE
12092rb_id_quote_unprintable(ID id)
12093{
12094 VALUE str = rb_id2str(id);
12095 if (!rb_str_symname_p(str)) {
12096 return rb_str_escape(str);
12097 }
12098 return str;
12099}
12100
12101/*
12102 * call-seq:
12103 * inspect -> string
12104 *
12105 * Returns a string representation of +self+ (including the leading colon):
12106 *
12107 * :foo.inspect # => ":foo"
12108 *
12109 * Related: Symbol#to_s, Symbol#name.
12110 *
12111 */
12112
12113static VALUE
12114sym_inspect(VALUE sym)
12115{
12116 VALUE str = rb_sym2str(sym);
12117 const char *ptr;
12118 long len;
12119 char *dest;
12120
12121 if (!rb_str_symname_p(str)) {
12122 str = rb_str_inspect(str);
12123 len = RSTRING_LEN(str);
12124 rb_str_resize(str, len + 1);
12125 dest = RSTRING_PTR(str);
12126 memmove(dest + 1, dest, len);
12127 }
12128 else {
12129 rb_encoding *enc = STR_ENC_GET(str);
12130 VALUE orig_str = str;
12131
12132 len = RSTRING_LEN(orig_str);
12133 str = rb_enc_str_new(0, len + 1, enc);
12134
12135 // Get data pointer after allocation
12136 ptr = RSTRING_PTR(orig_str);
12137 dest = RSTRING_PTR(str);
12138 memcpy(dest + 1, ptr, len);
12139
12140 RB_GC_GUARD(orig_str);
12141 }
12142 dest[0] = ':';
12143
12145
12146 return str;
12147}
12148
12149VALUE
12151{
12152 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12153 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12154 return str;
12155}
12156
12157VALUE
12158rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12159{
12160 VALUE obj;
12161
12162 if (argc < 1) {
12163 rb_raise(rb_eArgError, "no receiver given");
12164 }
12165 obj = argv[0];
12166 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12167}
12168
12169/*
12170 * call-seq:
12171 * succ
12172 *
12173 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12174 *
12175 * :foo.succ # => :fop
12176 *
12177 * Related: String#succ.
12178 */
12179
12180static VALUE
12181sym_succ(VALUE sym)
12182{
12183 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12184}
12185
12186/*
12187 * call-seq:
12188 * symbol <=> object -> -1, 0, +1, or nil
12189 *
12190 * If +object+ is a symbol,
12191 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12192 *
12193 * :bar <=> :foo # => -1
12194 * :foo <=> :foo # => 0
12195 * :foo <=> :bar # => 1
12196 *
12197 * Otherwise, returns +nil+:
12198 *
12199 * :foo <=> 'bar' # => nil
12200 *
12201 * Related: String#<=>.
12202 */
12203
12204static VALUE
12205sym_cmp(VALUE sym, VALUE other)
12206{
12207 if (!SYMBOL_P(other)) {
12208 return Qnil;
12209 }
12210 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12211}
12212
12213/*
12214 * call-seq:
12215 * casecmp(object) -> -1, 0, 1, or nil
12216 *
12217 * :include: doc/symbol/casecmp.rdoc
12218 *
12219 */
12220
12221static VALUE
12222sym_casecmp(VALUE sym, VALUE other)
12223{
12224 if (!SYMBOL_P(other)) {
12225 return Qnil;
12226 }
12227 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12228}
12229
12230/*
12231 * call-seq:
12232 * casecmp?(object) -> true, false, or nil
12233 *
12234 * :include: doc/symbol/casecmp_p.rdoc
12235 *
12236 */
12237
12238static VALUE
12239sym_casecmp_p(VALUE sym, VALUE other)
12240{
12241 if (!SYMBOL_P(other)) {
12242 return Qnil;
12243 }
12244 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12245}
12246
12247/*
12248 * call-seq:
12249 * symbol =~ object -> integer or nil
12250 *
12251 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12252 * including possible updates to global variables;
12253 * see String#=~.
12254 *
12255 */
12256
12257static VALUE
12258sym_match(VALUE sym, VALUE other)
12259{
12260 return rb_str_match(rb_sym2str(sym), other);
12261}
12262
12263/*
12264 * call-seq:
12265 * match(pattern, offset = 0) -> matchdata or nil
12266 * match(pattern, offset = 0) {|matchdata| } -> object
12267 *
12268 * Equivalent to <tt>self.to_s.match</tt>,
12269 * including possible updates to global variables;
12270 * see String#match.
12271 *
12272 */
12273
12274static VALUE
12275sym_match_m(int argc, VALUE *argv, VALUE sym)
12276{
12277 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12278}
12279
12280/*
12281 * call-seq:
12282 * match?(pattern, offset) -> true or false
12283 *
12284 * Equivalent to <tt>sym.to_s.match?</tt>;
12285 * see String#match?.
12286 *
12287 */
12288
12289static VALUE
12290sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12291{
12292 return rb_str_match_m_p(argc, argv, sym);
12293}
12294
12295/*
12296 * call-seq:
12297 * symbol[index] -> string or nil
12298 * symbol[start, length] -> string or nil
12299 * symbol[range] -> string or nil
12300 * symbol[regexp, capture = 0] -> string or nil
12301 * symbol[substring] -> string or nil
12302 *
12303 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12304 *
12305 */
12306
12307static VALUE
12308sym_aref(int argc, VALUE *argv, VALUE sym)
12309{
12310 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12311}
12312
12313/*
12314 * call-seq:
12315 * length -> integer
12316 *
12317 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12318 */
12319
12320static VALUE
12321sym_length(VALUE sym)
12322{
12323 return rb_str_length(rb_sym2str(sym));
12324}
12325
12326/*
12327 * call-seq:
12328 * empty? -> true or false
12329 *
12330 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12331 *
12332 */
12333
12334static VALUE
12335sym_empty(VALUE sym)
12336{
12337 return rb_str_empty(rb_sym2str(sym));
12338}
12339
12340/*
12341 * call-seq:
12342 * upcase(*options) -> symbol
12343 *
12344 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12345 *
12346 * See String#upcase.
12347 *
12348 */
12349
12350static VALUE
12351sym_upcase(int argc, VALUE *argv, VALUE sym)
12352{
12353 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12354}
12355
12356/*
12357 * call-seq:
12358 * downcase(*options) -> symbol
12359 *
12360 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12361 *
12362 * See String#downcase.
12363 *
12364 * Related: Symbol#upcase.
12365 *
12366 */
12367
12368static VALUE
12369sym_downcase(int argc, VALUE *argv, VALUE sym)
12370{
12371 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12372}
12373
12374/*
12375 * call-seq:
12376 * capitalize(*options) -> symbol
12377 *
12378 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12379 *
12380 * See String#capitalize.
12381 *
12382 */
12383
12384static VALUE
12385sym_capitalize(int argc, VALUE *argv, VALUE sym)
12386{
12387 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12388}
12389
12390/*
12391 * call-seq:
12392 * swapcase(*options) -> symbol
12393 *
12394 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12395 *
12396 * See String#swapcase.
12397 *
12398 */
12399
12400static VALUE
12401sym_swapcase(int argc, VALUE *argv, VALUE sym)
12402{
12403 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12404}
12405
12406/*
12407 * call-seq:
12408 * start_with?(*string_or_regexp) -> true or false
12409 *
12410 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12411 *
12412 */
12413
12414static VALUE
12415sym_start_with(int argc, VALUE *argv, VALUE sym)
12416{
12417 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12418}
12419
12420/*
12421 * call-seq:
12422 * end_with?(*strings) -> true or false
12423 *
12424 *
12425 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12426 *
12427 */
12428
12429static VALUE
12430sym_end_with(int argc, VALUE *argv, VALUE sym)
12431{
12432 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12433}
12434
12435/*
12436 * call-seq:
12437 * encoding -> encoding
12438 *
12439 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12440 *
12441 */
12442
12443static VALUE
12444sym_encoding(VALUE sym)
12445{
12446 return rb_obj_encoding(rb_sym2str(sym));
12447}
12448
12449static VALUE
12450string_for_symbol(VALUE name)
12451{
12452 if (!RB_TYPE_P(name, T_STRING)) {
12453 VALUE tmp = rb_check_string_type(name);
12454 if (NIL_P(tmp)) {
12455 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12456 name);
12457 }
12458 name = tmp;
12459 }
12460 return name;
12461}
12462
12463ID
12465{
12466 if (SYMBOL_P(name)) {
12467 return SYM2ID(name);
12468 }
12469 name = string_for_symbol(name);
12470 return rb_intern_str(name);
12471}
12472
12473VALUE
12475{
12476 if (SYMBOL_P(name)) {
12477 return name;
12478 }
12479 name = string_for_symbol(name);
12480 return rb_str_intern(name);
12481}
12482
12483/*
12484 * call-seq:
12485 * Symbol.all_symbols -> array_of_symbols
12486 *
12487 * Returns an array of all symbols currently in Ruby's symbol table:
12488 *
12489 * Symbol.all_symbols.size # => 9334
12490 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12491 *
12492 */
12493
12494static VALUE
12495sym_all_symbols(VALUE _)
12496{
12497 return rb_sym_all_symbols();
12498}
12499
12500VALUE
12501rb_str_to_interned_str(VALUE str)
12502{
12503 return rb_fstring(str);
12504}
12505
12506VALUE
12507rb_interned_str(const char *ptr, long len)
12508{
12509 struct RString fake_str;
12510 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12511}
12512
12513VALUE
12514rb_interned_str_cstr(const char *ptr)
12515{
12516 return rb_interned_str(ptr, strlen(ptr));
12517}
12518
12519VALUE
12520rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12521{
12522 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12523 rb_enc_autoload(enc);
12524 }
12525
12526 struct RString fake_str;
12527 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12528}
12529
12530VALUE
12531rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12532{
12533 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12534 rb_enc_autoload(enc);
12535 }
12536
12537 struct RString fake_str;
12538 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12539}
12540
12541VALUE
12542rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12543{
12544 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12545}
12546
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ has the ASCII-8BIT encoding
 * index and the value is in 0...0xff, a single byte is appended
 * directly; every other case is handed to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t byte = RB_NUM2SSIZE(codepoint);

        /* Note the exclusive upper bound: 0xff itself takes the slow path. */
        if (RB_LIKELY(byte >= 0 && byte < 0xff)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12563
12564void
12565Init_String(void)
12566{
12567 rb_cString = rb_define_class("String", rb_cObject);
12568 RUBY_ASSERT(rb_vm_fstring_table());
12569 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12571 rb_define_alloc_func(rb_cString, empty_str_alloc);
12572 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12573 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12574 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12575 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12576 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12579 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12580 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12581 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12582 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12585 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12586 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12587 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12588 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12591 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12592 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12593 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12594 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12595 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12597 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12599 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12600 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12601 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12602 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12603 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12604 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12606 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12607 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12608 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12609 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12610 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12611 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12612 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12613 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12615 rb_define_method(rb_cString, "+@", str_uplus, 0);
12616 rb_define_method(rb_cString, "-@", str_uminus, 0);
12617 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12618 rb_define_alias(rb_cString, "dedup", "-@");
12619
12620 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12621 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12622 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12623 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12626 rb_define_method(rb_cString, "undump", str_undump, 0);
12627
12628 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12629 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12630 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12631 sym_fold = ID2SYM(rb_intern_const("fold"));
12632
12633 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12634 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12635 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12636 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12637
12638 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12639 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12640 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12641 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12642
12643 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12644 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12645 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12646 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12647 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12648 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12649 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12650 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12651 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12652 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12653 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12654 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12656 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12657 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12658 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12659 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12660 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12661
12662 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12663 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12664 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12665
12666 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12667
12668 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12669 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12670 rb_define_method(rb_cString, "center", rb_str_center, -1);
12671
12672 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12673 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12674 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12675 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12676 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12677 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12678 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12679 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12680 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12681
12682 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12683 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12684 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12685 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12686 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12687 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12688 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12689 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12690 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12691
12692 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12693 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12694 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12695 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12696 rb_define_method(rb_cString, "count", rb_str_count, -1);
12697
12698 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12699 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12700 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12701 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12702
12703 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12704 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12705 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12706 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12707 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12708
12709 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12710
12711 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12712 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12713
12714 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12715 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12716
12717 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12718 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12719 rb_define_method(rb_cString, "b", rb_str_b, 0);
12720 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12721 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12722
12723 /* define UnicodeNormalize module here so that we don't have to look it up */
12724 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12725 id_normalize = rb_intern_const("normalize");
12726 id_normalized_p = rb_intern_const("normalized?");
12727
12728 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12729 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12730 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12731
12732 rb_fs = Qnil;
12733 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12734 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12735 rb_gc_register_address(&rb_fs);
12736
12737 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12741 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12742
12743 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12744 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12745 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12746 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12747 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12748 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12749
12750 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12751 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12752 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12753 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12754
12755 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12756 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12757 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12758 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12759 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12760 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12761 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12762
12763 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12764 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12765 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12766 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12767
12768 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12769 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12770
12771 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12772}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2635
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:937
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3877
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3483
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding relates APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of the character pointed to by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1285
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:900
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1150
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2926
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1169
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12520
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2249
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3611
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1098
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1390
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1291
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:919
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12542
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:784
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:669
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1835
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1042
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1841
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1887
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1905
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1677
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1455
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2400
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3676
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1366
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12150
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2472
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1342
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1671
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2954
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5268
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4045
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3051
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11449
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1752
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1713
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1132
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:954
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1461
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1916
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4031
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3444
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2338
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1934
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6476
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3059
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12514
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1372
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3642
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3001
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4147
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3268
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7197
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2692
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12507
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4101
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3918
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4076
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3618
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3176
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5778
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11507
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1627
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2850
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3148
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3251
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1144
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2648
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7311
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1354
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1643
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2352
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5696
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9404
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1138
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:878
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1871
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1888
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2960
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1291
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:970
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12474
ID rb_to_id(VALUE str)
Definition string.c:12464
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1384
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2827
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2711
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1378
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2722
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1704
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:197
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition load.c:1417
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:8269
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:300
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113