diff --git a/BUILD b/BUILD index 480646fc1a85ac68251f3566e55250ba532451cc..00330b69036a693805a3f3f8c6a9a99cfc84deec 100644 --- a/BUILD +++ b/BUILD @@ -9,22 +9,20 @@ licenses(["notice"]) exports_files(["LICENSE"]) config_setting( - name = "darwin", + name = "macos", values = {"cpu": "darwin"}, ) config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, + name = "wasm", + values = {"cpu": "wasm32"}, ) config_setting( - name = "windows_msvc", - values = {"cpu": "x64_windows_msvc"}, + name = "windows", + values = {"cpu": "x64_windows"}, ) -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") - cc_library( name = "re2", srcs = [ @@ -75,17 +73,17 @@ cc_library( "re2/stringpiece.h", ], copts = select({ + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), linkopts = select({ - # Darwin doesn't need `-pthread' when linking and it appears that + # macOS doesn't need `-pthread' when linking and it appears that # older versions of Clang will warn about the unused command line # argument, so just don't pass it. - ":darwin": [], + ":macos": [], + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), visibility = ["//visibility:public"], diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c980f04622010a8d7936afce787e718495a45be..fcd3870f2fa4182711b8893422ad5219dc734c19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,14 +5,15 @@ # Old enough to support Ubuntu Xenial. 
cmake_minimum_required(VERSION 3.5.1) -if(POLICY CMP0048) - cmake_policy(SET CMP0048 NEW) -endif() - project(RE2 CXX) include(CTest) include(GNUInstallDirs) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(USEPCRE "use PCRE in tests and benchmarks" OFF) @@ -20,6 +21,10 @@ option(USEPCRE "use PCRE in tests and benchmarks" OFF) # so we provide an option similar to BUILD_TESTING, but just for RE2. option(RE2_BUILD_TESTING "enable testing for RE2" ON) +# ABI version +# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html +set(SONAME 9) + set(EXTRA_TARGET_LINK_LIBRARIES) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") @@ -36,19 +41,14 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # Without a byte order mark (BOM), Visual Studio assumes that the source # file is encoded using the current user code page, so we specify UTF-8. add_compile_options(/utf-8) -elseif(CYGWIN OR MINGW) - # See https://stackoverflow.com/questions/38139631 for details. 
- add_compile_options(-std=gnu++11) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-std=c++11) endif() if(WIN32) add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX) add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS) elseif(UNIX) - add_compile_options(-pthread) - list(APPEND EXTRA_TARGET_LINK_LIBRARIES -pthread) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) endif() if(USEPCRE) @@ -56,8 +56,6 @@ if(USEPCRE) list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre) endif() -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - set(RE2_SOURCES re2/bitstate.cc re2/compile.cc @@ -84,8 +82,14 @@ set(RE2_SOURCES ) add_library(re2 ${RE2_SOURCES}) +target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>) +set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) add_library(re2::re2 ALIAS re2) +if(UNIX) + target_link_libraries(re2 PUBLIC Threads::Threads) +endif() + if(RE2_BUILD_TESTING) set(TESTING_SOURCES re2/testing/backtrack.cc @@ -99,6 +103,7 @@ if(RE2_BUILD_TESTING) ) add_library(testing STATIC ${TESTING_SOURCES}) + target_link_libraries(testing PUBLIC re2) set(TEST_TARGETS charclass_test @@ -130,14 +135,14 @@ if(RE2_BUILD_TESTING) foreach(target ${TEST_TARGETS}) add_executable(${target} re2/testing/${target}.cc util/test.cc) - target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES}) + target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) add_test(NAME ${target} COMMAND ${target}) - endforeach(target) + endforeach() foreach(target ${BENCHMARK_TARGETS}) add_executable(${target} re2/testing/${target}.cc util/benchmark.cc) - target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES}) - endforeach(target) + target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) + endforeach() endif() set(RE2_HEADERS diff --git a/Makefile b/Makefile index 15d782464178e0d23f65759b0dbc7f819abe8305..2409093966e84d4218f8e7c1a10e77119c625b68 100644 --- 
a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ endif # ABI version # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html -SONAME=0 +SONAME=9 # To rebuild the Tables generated by Perl and Python scripts (requires Internet # access for Unicode data), uncomment the following line: @@ -55,7 +55,7 @@ ifeq ($(shell uname),Darwin) SOEXT=dylib SOEXTVER=$(SONAME).$(SOEXT) SOEXTVER00=$(SONAME).0.0.$(SOEXT) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) else ifeq ($(shell uname),SunOS) SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) @@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0 MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) endif +.PHONY: all all: obj/libre2.a obj/so/libre2.$(SOEXT) INSTALL_HFILES=\ @@ -176,40 +177,49 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) +.PRECIOUS: obj/%.o obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/dbg/%.o obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc +.PRECIOUS: obj/so/%.o obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/libre2.a obj/libre2.a: $(OFILES) @mkdir -p obj $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) +.PRECIOUS: obj/dbg/libre2.a obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) -obj/so/libre2.$(SOEXT): $(SOFILES) +.PRECIOUS: 
obj/so/libre2.$(SOEXT) +obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin @mkdir -p obj/so $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) ln -sf libre2.$(SOEXTVER) $@ +.PRECIOUS: obj/dbg/test/% obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) +.PRECIOUS: obj/test/% obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # Test the shared lib, falling back to the static lib for private symbols +.PRECIOUS: obj/so/test/% obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) @@ -223,69 +233,100 @@ obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TES # is simply a way to check that the target builds and then to run it against a # fixed set of inputs. To perform real fuzzing, refer to the documentation for # libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/). 
+obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS) obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) ifdef REBUILD_TABLES +.PRECIOUS: re2/perl_groups.cc re2/perl_groups.cc: re2/make_perl_groups.pl perl $< > $@ -re2/unicode_%.cc: re2/make_unicode_%.py - python $< > $@ - -.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc +.PRECIOUS: re2/unicode_%.cc +re2/unicode_%.cc: re2/make_unicode_%.py re2/unicode.py + python3 $< > $@ endif +.PHONY: distclean distclean: clean rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc +.PHONY: clean clean: rm -rf obj rm -f re2/*.pyc +.PHONY: testofiles testofiles: $(TESTOFILES) +.PHONY: test test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test +.PHONY: debug-test debug-test: $(DTESTS) @./runtests $(DTESTS) +.PHONY: static-test static-test: $(TESTS) @./runtests $(TESTS) +.PHONY: shared-test shared-test: $(STESTS) @./runtests -shared-library-path obj/so $(STESTS) +.PHONY: debug-bigtest debug-bigtest: $(DTESTS) $(DBIGTESTS) @./runtests $(DTESTS) $(DBIGTESTS) +.PHONY: static-bigtest static-bigtest: $(TESTS) $(BIGTESTS) @./runtests $(TESTS) $(BIGTESTS) +.PHONY: shared-bigtest shared-bigtest: $(STESTS) $(SBIGTESTS) @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS) +.PHONY: benchmark benchmark: obj/test/regexp_benchmark +.PHONY: fuzz fuzz: obj/test/re2_fuzzer -install: obj/libre2.a obj/so/libre2.$(SOEXT) - mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig - $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 +.PHONY: install +install: static-install shared-install + +.PHONY: static +static: obj/libre2.a + +.PHONY: static-install +static-install: obj/libre2.a common-install $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + +.PHONY: shared +shared: obj/so/libre2.$(SOEXT) 
+ +.PHONY: shared-install +shared-install: obj/so/libre2.$(SOEXT) common-install $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) + +.PHONY: common-install +common-install: + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc +.PHONY: testinstall testinstall: static-testinstall shared-testinstall @echo @echo Install tests passed. @echo +.PHONY: static-testinstall static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: @@ -300,6 +341,7 @@ else obj/testinstall endif +.PHONY: shared-testinstall shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @@ -312,19 +354,14 @@ else LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall endif +.PHONY: benchlog benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') -# Keep gmake from deleting intermediate files it creates. -# This makes repeated builds faster and preserves debug info on OS X. 
- -.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \ - obj/dbg/libre2.a obj/so/libre2.a \ - obj/test/% obj/so/test/% obj/dbg/test/% - +.PHONY: log log: $(MAKE) clean $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ @@ -340,6 +377,3 @@ log: echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt - -x: x.cc obj/libre2.a - g++ -I. -o x x.cc obj/libre2.a diff --git a/README b/README index d1ef431b2b703ca24fbda11cad269f3c0dfae33b..caee6afb6b0051dabe19e7edf7f393752de42162 100644 --- a/README +++ b/README @@ -27,12 +27,16 @@ under the BSD-style license found in the LICENSE file. RE2's native language is C++. +The Python wrapper is at https://github.com/google/re2/tree/abseil/python +and on PyPI (https://pypi.org/project/google-re2/). + A C wrapper is at https://github.com/marcomaggi/cre2/. +A D wrapper is at https://github.com/ShigekiKarita/re2d/ and on DUB (code.dlang.org). An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm). An Inferno wrapper is at https://github.com/powerman/inferno-re2/. A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com). An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org). A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org). -A Python wrapper is at https://github.com/facebook/pyre2/ and on PyPI (pypi.org). -An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org). +An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org). A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org). +A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com). 
diff --git a/WORKSPACE b/WORKSPACE index 484abfe2a4bf9f7ec2ae330fa7a8f36b2037cb94..b35619c116692e95b8ac8aaa5960441f4bd72d07 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -5,11 +5,3 @@ # Bazel (http://bazel.io/) WORKSPACE file for RE2. workspace(name = "com_googlesource_code_re2") - -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") - -http_archive( - name = "rules_cc", - strip_prefix = "rules_cc-master", - urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"], -) diff --git a/doc/mksyntaxgo b/doc/mksyntaxgo old mode 100755 new mode 100644 index caad9b60b0315c2658271e238789c9204f7f7642..d30d281460e97ad2a2a62e29cfa180e9c5a2e46c --- a/doc/mksyntaxgo +++ b/doc/mksyntaxgo @@ -15,7 +15,7 @@ sam -d $out <<'!' ,s/\n\n\n+/\n\n/g ,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}' 1,2c -// Copyright 2012 The Go Authors. All rights reserved. +// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -33,6 +33,7 @@ Parts of the syntax can be disabled by passing alternate flags to Parse. . $a +Unicode character classes are those in unicode.Categories and unicode.Scripts. */ package syntax . diff --git a/doc/syntax.html b/doc/syntax.html index aa08b1108b9d0f8f4a28e3d25f9ae099901ebdf3..f0e01381bc8254528466b4c620246f2f8f36c5a7 100644 --- a/doc/syntax.html +++ b/doc/syntax.html @@ -47,6 +47,10 @@ x{-n}(≡ x{n}?) VIM x=(≡ x?) VIM +Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} +reject forms that create a minimum or maximum repetition count above 1000. +Unlimited repetitions are not subject to this restriction. 
+ Possessive repetitions: x*+zero or more x, possessive x++one or more x, possessive @@ -56,10 +60,10 @@ x{n}+exactly n x, possessive Grouping: -(re)numbered capturing group -(?P<name>re)named & numbered capturing group -(?<name>re)named & numbered capturing group -(?'name're)named & numbered capturing group +(re)numbered capturing group (submatch) +(?P<name>re)named & numbered capturing group (submatch) +(?<name>re)named & numbered capturing group (submatch) +(?'name're)named & numbered capturing group (submatch) (?:re)non-capturing group (?flags)set flags within current group; non-capturing (?flags:re)set flags during re; non-capturing @@ -80,8 +84,8 @@ ^at beginning of text or line (m=true) $at end of text (like \z not \Z) or line (m=true) \Aat beginning of text -\bat word boundary (\w on one side and \W, \A, or \z on the other) -\Bnot a word boundary +\bat ASCII word boundary (\w on one side and \W, \A, or \z on the other) +\Bnot at ASCII word boundary \Gat beginning of subtext being searched PCRE \Gat end of last match PERL \Zat end of text, or before newline at end of text @@ -166,7 +170,7 @@ [\p{Name}]named Unicode property inside character class (≡ \p{Name}) [^\p{Name}]named Unicode property inside negated character class (≡ \P{Name}) -Perl character classes: +Perl character classes (all ASCII-only): \ddigits (≡ [0-9]) \Dnot digits (≡ [^0-9]) \swhitespace (≡ [\t\n\f\r ]) @@ -237,105 +241,167 @@ Zsspace separator Unicode character class names--scripts: -ArabicArabic -ArmenianArmenian -BalineseBalinese -BamumBamum -BatakBatak -BengaliBengali -BopomofoBopomofo -BrahmiBrahmi -BrailleBraille -BugineseBuginese -BuhidBuhid -Canadian_AboriginalCanadian Aboriginal -CarianCarian -ChakmaChakma -ChamCham -CherokeeCherokee -Commoncharacters not specific to one script -CopticCoptic -CuneiformCuneiform -CypriotCypriot -CyrillicCyrillic -DeseretDeseret -DevanagariDevanagari -Egyptian_HieroglyphsEgyptian Hieroglyphs -EthiopicEthiopic -GeorgianGeorgian -GlagoliticGlagolitic 
-GothicGothic -GreekGreek -GujaratiGujarati -GurmukhiGurmukhi -HanHan -HangulHangul -HanunooHanunoo -HebrewHebrew -HiraganaHiragana -Imperial_AramaicImperial Aramaic -Inheritedinherit script from previous character -Inscriptional_PahlaviInscriptional Pahlavi -Inscriptional_ParthianInscriptional Parthian -JavaneseJavanese -KaithiKaithi -KannadaKannada -KatakanaKatakana -Kayah_LiKayah Li -KharoshthiKharoshthi -KhmerKhmer -LaoLao -LatinLatin -LepchaLepcha -LimbuLimbu -Linear_BLinear B -LycianLycian -LydianLydian -MalayalamMalayalam -MandaicMandaic -Meetei_MayekMeetei Mayek -Meroitic_CursiveMeroitic Cursive -Meroitic_HieroglyphsMeroitic Hieroglyphs -MiaoMiao -MongolianMongolian -MyanmarMyanmar -New_Tai_LueNew Tai Lue (aka Simplified Tai Lue) -NkoNko -OghamOgham -Ol_ChikiOl Chiki -Old_ItalicOld Italic -Old_PersianOld Persian -Old_South_ArabianOld South Arabian -Old_TurkicOld Turkic -OriyaOriya -OsmanyaOsmanya -Phags_Pa'Phags Pa -PhoenicianPhoenician -RejangRejang -RunicRunic -SaurashtraSaurashtra -SharadaSharada -ShavianShavian -SinhalaSinhala -Sora_SompengSora Sompeng -SundaneseSundanese -Syloti_NagriSyloti Nagri -SyriacSyriac -TagalogTagalog -TagbanwaTagbanwa -Tai_LeTai Le -Tai_ThamTai Tham -Tai_VietTai Viet -TakriTakri -TamilTamil -TeluguTelugu -ThaanaThaana -ThaiThai -TibetanTibetan -TifinaghTifinagh -UgariticUgaritic -VaiVai -YiYi +Adlam +Ahom +Anatolian_Hieroglyphs +Arabic +Armenian +Avestan +Balinese +Bamum +Bassa_Vah +Batak +Bengali +Bhaiksuki +Bopomofo +Brahmi +Braille +Buginese +Buhid +Canadian_Aboriginal +Carian +Caucasian_Albanian +Chakma +Cham +Cherokee +Chorasmian +Common +Coptic +Cuneiform +Cypriot +Cypro_Minoan +Cyrillic +Deseret +Devanagari +Dives_Akuru +Dogra +Duployan +Egyptian_Hieroglyphs +Elbasan +Elymaic +Ethiopic +Georgian +Glagolitic +Gothic +Grantha +Greek +Gujarati +Gunjala_Gondi +Gurmukhi +Han +Hangul +Hanifi_Rohingya +Hanunoo +Hatran +Hebrew +Hiragana +Imperial_Aramaic +Inherited +Inscriptional_Pahlavi +Inscriptional_Parthian +Javanese 
+Kaithi +Kannada +Katakana +Kayah_Li +Kharoshthi +Khitan_Small_Script +Khmer +Khojki +Khudawadi +Lao +Latin +Lepcha +Limbu +Linear_A +Linear_B +Lisu +Lycian +Lydian +Mahajani +Makasar +Malayalam +Mandaic +Manichaean +Marchen +Masaram_Gondi +Medefaidrin +Meetei_Mayek +Mende_Kikakui +Meroitic_Cursive +Meroitic_Hieroglyphs +Miao +Modi +Mongolian +Mro +Multani +Myanmar +Nabataean +Nandinagari +New_Tai_Lue +Newa +Nko +Nushu +Nyiakeng_Puachue_Hmong +Ogham +Ol_Chiki +Old_Hungarian +Old_Italic +Old_North_Arabian +Old_Permic +Old_Persian +Old_Sogdian +Old_South_Arabian +Old_Turkic +Old_Uyghur +Oriya +Osage +Osmanya +Pahawh_Hmong +Palmyrene +Pau_Cin_Hau +Phags_Pa +Phoenician +Psalter_Pahlavi +Rejang +Runic +Samaritan +Saurashtra +Sharada +Shavian +Siddham +SignWriting +Sinhala +Sogdian +Sora_Sompeng +Soyombo +Sundanese +Syloti_Nagri +Syriac +Tagalog +Tagbanwa +Tai_Le +Tai_Tham +Tai_Viet +Takri +Tamil +Tangsa +Tangut +Telugu +Thaana +Thai +Tibetan +Tifinagh +Tirhuta +Toto +Ugaritic +Vai +Vithkuqi +Wancho +Warang_Citi +Yezidi +Yi +Zanabazar_Square Vim character classes: \iidentifier character VIM diff --git a/doc/syntax.txt b/doc/syntax.txt index cb04bbf05eb80a921e133eb356e51ecc54af9bc8..c12a482dde0317054ed72d6cb360aa09e6044fc8 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -253,13 +253,16 @@ Caucasian_Albanian Chakma Cham Cherokee +Chorasmian Common Coptic Cuneiform Cypriot +Cypro_Minoan Cyrillic Deseret Devanagari +Dives_Akuru Dogra Duployan Egyptian_Hieroglyphs @@ -291,6 +294,7 @@ Kannada Katakana Kayah_Li Kharoshthi +Khitan_Small_Script Khmer Khojki Khudawadi @@ -338,6 +342,7 @@ Old_Persian Old_Sogdian Old_South_Arabian Old_Turkic +Old_Uyghur Oriya Osage Osmanya @@ -369,6 +374,7 @@ Tai_Tham Tai_Viet Takri Tamil +Tangsa Tangut Telugu Thaana @@ -376,10 +382,13 @@ Thai Tibetan Tifinagh Tirhuta +Toto Ugaritic Vai +Vithkuqi Wancho Warang_Citi +Yezidi Yi Zanabazar_Square diff --git a/libre2.map b/libre2.map index 
5287893e2a4ad72f75e319d7d38e4a82bb5a89ac..cf15d159896a6a740e82de4202a62c847ddbe823 100644 --- a/libre2.map +++ b/libre2.map @@ -4,6 +4,8 @@ "re2::RE2::~RE2()"; "re2::RE2::RE2(re2::StringPiece const&, re2::RE2::Options const&)"; "re2::RE2::FullMatchN(re2::StringPiece const&, re2::RE2 const&, re2::RE2::Arg const* const*, int)"; + "re2::RE2::GlobalReplace(std::__h::basic_string<char, std::__h::char_traits<char>, std::__h::allocator<char> >*, re2::RE2 const&, re2::StringPiece const&)"; + "re2::RE2::RE2(std::__h::basic_string<char, std::__h::char_traits<char>, std::__h::allocator<char> > const&)"; }; local: *; diff --git a/libre2.symbols b/libre2.symbols index 8308b6489222c2bd35ca07b2eb3b47f12b480759..93b71b486233e22a440feeae7175b784cf06f151 100644 --- a/libre2.symbols +++ b/libre2.symbols @@ -11,6 +11,9 @@ # re2::FilteredRE2* _ZN3re211FilteredRE2*; _ZNK3re211FilteredRE2*; + # re2::re2_internal* + _ZN3re212re2_internal*; + _ZNK3re212re2_internal*; local: *; }; diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin index 31e8c52209eeb7290008a3aa4ea7aafedadd3976..41ac96f93b10cafc08f091dbc0eee6191566775a 100644 --- a/libre2.symbols.darwin +++ b/libre2.symbols.darwin @@ -10,3 +10,6 @@ __ZN3re2ls* # re2::FilteredRE2* __ZN3re211FilteredRE2* __ZNK3re211FilteredRE2* +# re2::re2_internal* +__ZN3re212re2_internal* +__ZNK3re212re2_internal* diff --git a/re2/bitmap256.h b/re2/bitmap256.h index f649b4ccca09286d9ae2e7f6796b58a32c538e3b..4899379e4d9992ff8229ec9fc7ab9343ff8a2380 100644 --- a/re2/bitmap256.h +++ b/re2/bitmap256.h @@ -32,7 +32,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - return (words_[c / 64] & (1ULL << (c % 64))) != 0; + return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0; } // Sets the bit with index c. @@ -40,7 +40,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - words_[c / 64] |= (1ULL << (c % 64)); + words_[c / 64] |= (uint64_t{1} << (c % 64)); } // Finds the next non-zero bit with index >= c. @@ -51,7 +51,6 @@ class Bitmap256 { // Finds the least significant non-zero bit in n. 
static int FindLSBSet(uint64_t n) { DCHECK_NE(n, 0); - #if defined(__GNUC__) return __builtin_ctzll(n); #elif defined(_MSC_VER) && defined(_M_X64) @@ -89,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const { // Check the word that contains the bit. Mask out any lower bits. int i = c / 64; - uint64_t word = words_[i] & (~0ULL << (c % 64)); + uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); if (word != 0) return (i * 64) + FindLSBSet(word); diff --git a/re2/bitstate.cc b/re2/bitstate.cc index f15c1e43fe2276d1c4e1e4d41cbf12ca5a8185ec..877e54823411addc606aa0969e5ae58f07248f36 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -7,7 +7,7 @@ // Prog::SearchBitState is a regular expression search with submatch // tracking for small regular expressions and texts. Similarly to // testing/backtrack.cc, it allocates a bitmap with (count of -// lists) * (length of prog) bits to make sure it never explores the +// lists) * (length of text) bits to make sure it never explores the // same (instruction list, character position) multiple times. This // limits the search to run in time linear in the length of the text. 
// @@ -63,11 +63,14 @@ class BitState { int nsubmatch_; // # of submatches to fill in // Search state - static const int VisitedBits = 32; - PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited + static constexpr int kVisitedBits = 64; + PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited PODArray<const char*> cap_; // capture registers PODArray<Job> job_; // stack of text positions to explore int njob_; // stack size + + BitState(const BitState&) = delete; + BitState& operator=(const BitState&) = delete; }; BitState::BitState(Prog* prog) @@ -87,9 +90,9 @@ BitState::BitState(Prog* prog) bool BitState::ShouldVisit(int id, const char* p) { int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) + static_cast<int>(p-text_.data()); - if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) + if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) return false; - visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); + visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); return true; } @@ -290,9 +293,9 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, context_ = context; if (context_.data() == NULL) context_ = text; - if (prog_->anchor_start() && context_.begin() != text.begin()) + if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text)) return false; - if (prog_->anchor_end() && context_.end() != text.end()) + if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text)) return false; anchored_ = anchored || prog_->anchor_start(); longest_ = longest || prog_->anchor_end(); @@ -304,8 +307,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Allocate scratch space. 
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); - nvisited = (nvisited + VisitedBits-1) / VisitedBits; - visited_ = PODArray<uint32_t>(nvisited); + nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; + visited_ = PODArray<uint64_t>(nvisited); memset(visited_.data(), 0, nvisited*sizeof visited_[0]); int ncap = 2*nsubmatch; @@ -329,14 +332,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // This looks like it's quadratic in the size of the text, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. - for (const char* p = text.data(); p <= text.data() + text.size(); p++) { - // Try to use memchr to find the first byte quickly. - int fb = prog_->first_byte(); - if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) { - p = reinterpret_cast<const char*>( - memchr(p, fb, text.data() + text.size() - p)); + const char* etext = text.data() + text.size(); + for (const char* p = text.data(); p <= etext; p++) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + if (p < etext && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p)); if (p == NULL) - p = text.data() + text.size(); + p = etext; } cap_[0] = p; @@ -375,7 +377,7 @@ bool Prog::SearchBitState(const StringPiece& text, bool longest = kind != kFirstMatch; if (!b.Search(text, context, anchored, longest, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/compile.cc b/re2/compile.cc index 7848dfcf4ebf346f528a0969ae5eee6f6b77fa70..61d801a630f54a799565293e65f28aee0f250ffc 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -30,92 +30,60 @@ namespace re2 { // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. // // Because the out and out1 fields in Inst are no longer pointers, -// we can't use pointers directly here either. 
Instead, p refers -// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). -// p == 0 represents the NULL list. This is okay because instruction #0 +// we can't use pointers directly here either. Instead, head refers +// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). +// head == 0 represents the NULL list. This is okay because instruction #0 // is always the fail instruction, which never appears on a list. - struct PatchList { - uint32_t p; - // Returns patch list containing just p. - static PatchList Mk(uint32_t p); + static PatchList Mk(uint32_t p) { + return {p, p}; + } - // Patches all the entries on l to have value v. + // Patches all the entries on l to have value p. // Caller must not ever use patch list again. - static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v); - - // Deref returns the next pointer pointed at by p. - static PatchList Deref(Prog::Inst *inst0, PatchList l); - - // Appends two patch lists and returns result. - static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); -}; - -static PatchList nullPatchList = { 0 }; - -// Returns patch list containing just p. -PatchList PatchList::Mk(uint32_t p) { - PatchList l; - l.p = p; - return l; -} - -// Returns the next pointer pointed at by l. -PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - l.p = ip->out1(); - else - l.p = ip->out(); - return l; -} - -// Patches all the entries on l to have value v. 
-void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) { - while (l.p != 0) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) { - l.p = ip->out1(); - ip->out1_ = val; - } else { - l.p = ip->out(); - ip->set_out(val); + static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { + while (l.head != 0) { + Prog::Inst* ip = &inst0[l.head>>1]; + if (l.head&1) { + l.head = ip->out1(); + ip->out1_ = p; + } else { + l.head = ip->out(); + ip->set_out(p); + } } } -} -// Appends two patch lists and returns result. -PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { - if (l1.p == 0) - return l2; - if (l2.p == 0) - return l1; - - PatchList l = l1; - for (;;) { - PatchList next = PatchList::Deref(inst0, l); - if (next.p == 0) - break; - l = next; + // Appends two patch lists and returns result. + static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.head == 0) + return l2; + if (l2.head == 0) + return l1; + Prog::Inst* ip = &inst0[l1.tail>>1]; + if (l1.tail&1) + ip->out1_ = l2.head; + else + ip->set_out(l2.head); + return {l1.head, l2.tail}; } - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - ip->out1_ = l2.p; - else - ip->set_out(l2.p); + uint32_t head; + uint32_t tail; // for constant-time append +}; - return l1; -} +static const PatchList kNullPatchList = {0, 0}; // Compiled program fragment. struct Frag { uint32_t begin; PatchList end; + bool nullable; - Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector - Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} + Frag() : begin(0), end(kNullPatchList), nullable(false) {} + Frag(uint32_t begin, PatchList end, bool nullable) + : begin(begin), end(end), nullable(nullable) {} }; // Input encodings. @@ -212,8 +180,8 @@ class Compiler : public Regexp::Walker { int AddSuffixRecursive(int root, int id); // Finds the trie node for the given suffix. 
Returns a Frag in order to - // distinguish between pointing at the root node directly (end.p == 0) - // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). + // distinguish between pointing at the root node directly (end.head == 0) + // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). Frag FindByteRange(int root, int id); // Compares two ByteRanges and returns true iff they are equal. @@ -225,8 +193,8 @@ class Compiler : public Regexp::Walker { // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); - Prog* Finish(); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); + Prog* Finish(Regexp* re); // Returns .* where dot = any byte Frag DotStar(); @@ -298,7 +266,7 @@ int Compiler::AllocInst(int n) { // Returns an unmatchable fragment. Frag Compiler::NoMatch() { - return Frag(0, nullPatchList); + return Frag(); } // Is a an unmatchable fragment? @@ -314,7 +282,7 @@ Frag Compiler::Cat(Frag a, Frag b) { // Elide no-op. Prog::Inst* begin = &inst_[a.begin]; if (begin->opcode() == kInstNop && - a.end.p == (a.begin << 1) && + a.end.head == (a.begin << 1) && begin->out() == 0) { // in case refs to a somewhere PatchList::Patch(inst_.data(), a.end, b.begin); @@ -324,11 +292,11 @@ Frag Compiler::Cat(Frag a, Frag b) { // To run backward over string, reverse all concatenations. if (reversed_) { PatchList::Patch(inst_.data(), b.end, a.begin); - return Frag(b.begin, a.end); + return Frag(b.begin, a.end, b.nullable && a.nullable); } PatchList::Patch(inst_.data(), a.end, b.begin); - return Frag(a.begin, b.end); + return Frag(a.begin, b.end, a.nullable && b.nullable); } // Given fragments for a and b, returns fragment for a|b. 
@@ -344,7 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) { return NoMatch(); inst_[id].InitAlt(a.begin, b.begin); - return Frag(id, PatchList::Append(inst_.data(), a.end, b.end)); + return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), + a.nullable || b.nullable); } // When capturing submatches in like-Perl mode, a kOpAlt Inst @@ -354,27 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) { // then the operator is greedy. If out1_ is the repetition // (and out_ moves forward), then the operator is non-greedy. -// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) -Frag Compiler::Star(Frag a, bool nongreedy) { +// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) +Frag Compiler::Plus(Frag a, bool nongreedy) { int id = AllocInst(1); if (id < 0) return NoMatch(); - inst_[id].InitAlt(0, 0); - PatchList::Patch(inst_.data(), a.end, id); + PatchList pl; if (nongreedy) { - inst_[id].out1_ = a.begin; - return Frag(id, PatchList::Mk(id << 1)); + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); } else { - inst_[id].set_out(a.begin); - return Frag(id, PatchList::Mk((id << 1) | 1)); + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); } + PatchList::Patch(inst_.data(), a.end, id); + return Frag(a.begin, pl, a.nullable); } -// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) -Frag Compiler::Plus(Frag a, bool nongreedy) { - // a+ is just a* with a different entry point. - Frag f = Star(a, nongreedy); - return Frag(a.begin, f.end); +// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) +Frag Compiler::Star(Frag a, bool nongreedy) { + // When the subexpression is nullable, one Alt isn't enough to guarantee + // correct priority ordering within the transitive closure. The simplest + // solution is to handle it as (a+)? instead, which adds the second Alt. 
+ if (a.nullable) + return Quest(Plus(a, nongreedy), nongreedy); + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } + PatchList::Patch(inst_.data(), a.end, id); + return Frag(id, pl, true); } // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) @@ -392,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } - return Frag(id, PatchList::Append(inst_.data(), pl, a.end)); + return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); } // Returns a fragment for the byte range lo-hi. @@ -401,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { if (id < 0) return NoMatch(); inst_[id].InitByteRange(lo, hi, foldcase, 0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), false); } // Returns a no-op fragment. Sometimes unavoidable. @@ -410,7 +396,7 @@ Frag Compiler::Nop() { if (id < 0) return NoMatch(); inst_[id].InitNop(0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), true); } // Returns a fragment that signals a match. @@ -419,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) { if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); - return Frag(id, nullPatchList); + return Frag(id, kNullPatchList, false); } // Returns a fragment matching a particular empty-width op (like ^ or $) @@ -428,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (id < 0) return NoMatch(); inst_[id].InitEmptyWidth(empty, 0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), true); } // Given a fragment a, returns a fragment with capturing parens around a. 
@@ -442,7 +428,7 @@ Frag Compiler::Capture(Frag a, int n) { inst_[id+1].InitCapture(2*n+1, 0); PatchList::Patch(inst_.data(), a.end, id+1); - return Frag(id, PatchList::Mk((id+1) << 1)); + return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); } // A Rune is a name for a Unicode code point. @@ -467,7 +453,7 @@ static int MaxRune(int len) { void Compiler::BeginRange() { rune_cache_.clear(); rune_range_.begin = 0; - rune_range_.end = nullPatchList; + rune_range_.end = kNullPatchList; } int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, @@ -548,9 +534,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { } int br; - if (f.end.p == 0) + if (f.end.head == 0) br = root; - else if (f.end.p&1) + else if (f.end.head&1) br = inst_[f.begin].out1(); else br = inst_[f.begin].out(); @@ -566,9 +552,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { // Ensure that the parent points to the clone, not to the original. // Note that this could leave the head unreachable except via the cache. br = byterange; - if (f.end.p == 0) + if (f.end.head == 0) root = br; - else if (f.end.p&1) + else if (f.end.head&1) inst_[f.begin].out1_ = br; else inst_[f.begin].set_out(br); @@ -601,7 +587,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) { Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) - return Frag(root, nullPatchList); + return Frag(root, kNullPatchList, false); else return NoMatch(); } @@ -609,7 +595,7 @@ Frag Compiler::FindByteRange(int root, int id) { while (inst_[root].opcode() == kInstAlt) { int out1 = inst_[root].out1(); if (ByteRangeEqual(out1, id)) - return Frag(root, PatchList::Mk((root << 1) | 1)); + return Frag(root, PatchList::Mk((root << 1) | 1), false); // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't // what we're looking for, then we can stop immediately. 
Unfortunately, we @@ -621,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) { if (inst_[out].opcode() == kInstAlt) root = out; else if (ByteRangeEqual(out, id)) - return Frag(root, PatchList::Mk(root << 1)); + return Frag(root, PatchList::Mk(root << 1), false); else return NoMatch(); } @@ -662,48 +648,43 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { static_cast(hi), foldcase, 0)); } -// Table describing how to make a UTF-8 matching machine -// for the rune range 80-10FFFF (Runeself-Runemax). -// This range happens frequently enough (for example /./ and /[^a-z]/) -// and the rune_cache_ map is slow enough that this is worth -// special handling. Makes compilation of a small expression -// with a dot in it about 10% faster. -// The * in the comments below mark whole sequences. -static struct ByteRangeProg { - int next; - int lo; - int hi; -} prog_80_10ffff[] = { - // Two-byte - { -1, 0x80, 0xBF, }, // 0: 80-BF - { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* - - // Three-byte - { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF - { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* - { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF - { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* - - // Four-byte - { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF - { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* - { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF - { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* - { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF - { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* -}; - void Compiler::Add_80_10ffff() { - int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning - for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) { - const ByteRangeProg& p = prog_80_10ffff[i]; - int next = 0; - if (p.next >= 0) - next = inst[p.next]; - inst[i] = UncachedRuneByteSuffix(static_cast(p.lo), - static_cast(p.hi), false, next); - if ((p.lo & 0xC0) != 0x80) - AddSuffix(inst[i]); + // The 80-10FFFF (Runeself-Runemax) rune range occurs 
frequently enough + // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by + // permitting overlong encodings in E0 and F0 sequences and code points + // over 10FFFF in F4 sequences, the size of the bytecode and the number + // of equivalence classes are reduced significantly. + int id; + if (reversed_) { + // Prefix factoring matters, but we don't have to handle it here + // because the rune range trie logic takes care of that already. + id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + } else { + // Suffix factoring matters - and we do have to handle it here. + int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); + id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); + AddSuffix(id); + + int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); + AddSuffix(id); + + int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); + AddSuffix(id); } } @@ -711,9 +692,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { if (lo > hi) return; - // Pick off 80-10FFFF as a common special case - // that can bypass the slow rune_cache_. - if (lo == 0x80 && hi == 0x10ffff && !reversed_) { + // Pick off 80-10FFFF as a common special case. 
+ if (lo == 0x80 && hi == 0x10ffff) { Add_80_10ffff(); return; } @@ -1095,8 +1075,6 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) { - prog_->set_flags(flags); - if (flags & Regexp::Latin1) encoding_ = kEncodingLatin1; max_mem_ = max_mem; @@ -1117,14 +1095,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // on the program.) if (m >= 1<<24) m = 1<<24; - // Inst imposes its own limit (currently bigger than 2^24 but be safe). if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_ninst_ = static_cast(m); } - anchor_ = anchor; } @@ -1178,10 +1153,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. - return c.Finish(); + return c.Finish(re); } -Prog* Compiler::Finish() { +Prog* Compiler::Finish(Regexp* re) { if (failed_) return NULL; @@ -1198,6 +1173,13 @@ Prog* Compiler::Finish() { prog_->Flatten(); prog_->ComputeByteMap(); + if (!prog_->reversed()) { + std::string prefix; + bool prefix_foldcase; + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) + prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); + } + // Record remaining memory for DFA. 
if (max_mem_ <= 0) { prog_->set_dfa_mem(1<<20); @@ -1254,7 +1236,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - Prog* prog = c.Finish(); + Prog* prog = c.Finish(re); if (prog == NULL) return NULL; diff --git a/re2/dfa.cc b/re2/dfa.cc index 816080a029c2fb0fafc1f5e0b2c160857a62ce8d..d47c7d50a7e00a92f5d88227e679e6219f336159 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -42,6 +42,7 @@ #include "util/strutil.h" #include "re2/pod_array.h" #include "re2/prog.h" +#include "re2/re2.h" #include "re2/sparse_set.h" #include "re2/stringpiece.h" @@ -52,20 +53,13 @@ namespace re2 { -#if !defined(__linux__) /* only Linux seems to have memrchr */ -static void* memrchr(const void* s, int c, size_t n) { - const unsigned char* p = (const unsigned char*)s; - for (p += n; n > 0; n--) - if (*--p == c) - return (void*)p; - - return NULL; -} -#endif - // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; +void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) { + dfa_should_bail_when_slow = b; +} + // Changing this to true compiles in prints that trace execution of the DFA. // Generates a lot of output -- only useful for debugging. static const bool ExtraDebug = false; @@ -177,11 +171,8 @@ class DFA { typedef std::unordered_set StateSet; private: - // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.) - enum { - kFbUnknown = -1, // No analysis has been performed. - kFbNone = -2, // The first-byte trick cannot be used. - }; + // Make it easier to swap in a scalable reader-writer mutex. + using CacheMutex = Mutex; enum { // Indices into start_ for unanchored searches. 
@@ -249,25 +240,26 @@ class DFA { struct SearchParams { SearchParams(const StringPiece& text, const StringPiece& context, RWLocker* cache_lock) - : text(text), context(context), + : text(text), + context(context), anchored(false), + can_prefix_accel(false), want_earliest_match(false), run_forward(false), start(NULL), - first_byte(kFbUnknown), cache_lock(cache_lock), failed(false), ep(NULL), - matches(NULL) { } + matches(NULL) {} StringPiece text; StringPiece context; bool anchored; + bool can_prefix_accel; bool want_earliest_match; bool run_forward; State* start; - int first_byte; - RWLocker *cache_lock; + RWLocker* cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match SparseSet* matches; @@ -278,15 +270,13 @@ class DFA { }; // Before each search, the parameters to Search are analyzed by - // AnalyzeSearch to determine the state in which to start and the - // "first_byte" for that state, if any. + // AnalyzeSearch to determine the state in which to start. struct StartInfo { - StartInfo() : start(NULL), first_byte(kFbUnknown) {} - State* start; - std::atomic first_byte; + StartInfo() : start(NULL) {} + std::atomic start; }; - // Fills in params->start and params->first_byte using + // Fills in params->start and params->can_prefix_accel using // the other search parameters. Returns true on success, // false on failure. // cache_mutex_.r <= L < mutex_ @@ -297,10 +287,10 @@ class DFA { // The generic search loop, inlined to create specialized versions. // cache_mutex_.r <= L < mutex_ // Might unlock and relock cache_mutex_ via params->cache_lock. - inline bool InlinedSearchLoop(SearchParams* params, - bool have_first_byte, - bool want_earliest_match, - bool run_forward); + template + inline bool InlinedSearchLoop(SearchParams* params); // The specialized versions of InlinedSearchLoop. 
The three letters // at the ends of the name denote the true/false values used as the @@ -322,13 +312,6 @@ class DFA { // Might unlock and relock cache_mutex_ via params->cache_lock. bool FastSearchLoop(SearchParams* params); - // For debugging, a slow search loop that calls InlinedSearchLoop - // directly -- because the booleans passed are not constants, the - // loop is not specialized like the SearchFFF etc. versions, so it - // runs much more slowly. Useful only for debugging. - // cache_mutex_.r <= L < mutex_ - // Might unlock and relock cache_mutex_ via params->cache_lock. - bool SlowSearchLoop(SearchParams* params); // Looks up bytes in bytemap_ but handles case c == kByteEndText too. int ByteMap(int c) { @@ -355,11 +338,14 @@ class DFA { // while holding cache_mutex_ for writing, to avoid interrupting other // readers. Any State* pointers are only valid while cache_mutex_ // is held. - Mutex cache_mutex_; + CacheMutex cache_mutex_; int64_t mem_budget_; // Total memory budget for all States. int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. StartInfo start_[kMaxStart]; + + DFA(const DFA&) = delete; + DFA& operator=(const DFA&) = delete; }; // Shorthand for casting to uint8_t*. @@ -613,7 +599,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: // those are the only operators with any effect in // RunWorkqOnEmptyString or RunWorkqOnByte. 
- int* inst = new int[q->size()]; + PODArray inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch @@ -643,7 +629,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { (it == q->begin() && ip->greedy(prog_))) && (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; @@ -690,7 +675,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // the execution loop can stop early. This is only okay // if the state is *not* a matching state. if (n == 0 && flag == 0) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> DeadState\n"); return DeadState; @@ -700,7 +684,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // unordered state sets separated by Marks. Sort each set // to canonicalize, to reduce the number of distinct sets stored. if (kind_ == Prog::kLongestMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; while (ip < ep) { int* markp = ip; @@ -717,7 +701,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // we have an unordered set of states (i.e. we don't have Marks) // and sorting will reduce the number of distinct sets stored. if (kind_ == Prog::kManyMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; std::sort(ip, ep); } @@ -736,8 +720,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Save the needed empty-width flags in the top bits for use later. 
flag |= needflags << kFlagNeedShift; - State* state = CachedState(inst, n, flag); - delete[] inst; + State* state = CachedState(inst.data(), n, flag); return state; } @@ -971,8 +954,21 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, break; case kInstByteRange: // can follow if c is in range - if (ip->Matches(c)) - AddToQueue(newq, ip->out(), flag); + if (!ip->Matches(c)) + break; + AddToQueue(newq, ip->out(), flag); + if (ip->hint() != 0) { + // We have a hint, but we must cancel out the + // increment that will occur after the break. + i += ip->hint() - 1; + } else { + // We have no hint, so we must find the end + // of the current list and then skip to it. + Prog::Inst* ip0 = ip; + while (!ip->last()) + ++ip; + i += ip - ip0; + } break; case kInstMatch: @@ -1117,7 +1113,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { class DFA::RWLocker { public: - explicit RWLocker(Mutex* mu); + explicit RWLocker(CacheMutex* mu); ~RWLocker(); // If the lock is only held for reading right now, @@ -1127,14 +1123,14 @@ class DFA::RWLocker { void LockForWriting(); private: - Mutex* mu_; + CacheMutex* mu_; bool writing_; RWLocker(const RWLocker&) = delete; RWLocker& operator=(const RWLocker&) = delete; }; -DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) { +DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } @@ -1171,11 +1167,14 @@ void DFA::ResetCache(RWLocker* cache_lock) { // Re-acquire the cache_mutex_ for writing (exclusive use). cache_lock->LockForWriting(); + hooks::GetDFAStateCacheResetHook()({ + state_budget_, + state_cache_.size(), + }); + // Clear the cache, reset the memory budget. 
- for (int i = 0; i < kMaxStart; i++) { - start_[i].start = NULL; - start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed); - } + for (int i = 0; i < kMaxStart; i++) + start_[i].start.store(NULL, std::memory_order_relaxed); ClearCache(); mem_budget_ = state_budget_; } @@ -1290,8 +1289,7 @@ DFA::State* DFA::StateSaver::Restore() { // situation, the DFA can do better than executing the simple loop. // Instead, it can call memchr to search very quickly for the byte c. // Whether the start state has this property is determined during a -// pre-compilation pass, and if so, the byte b is passed to the search -// loop as the "first_byte" argument, along with a boolean "have_first_byte". +// pre-compilation pass and the "can_prefix_accel" argument is set. // // Fourth, the desired behavior is to search for the leftmost-best match // (approximately, the same one that Perl would find), which is not @@ -1323,10 +1321,10 @@ DFA::State* DFA::StateSaver::Restore() { // The bools are equal to the same-named variables in params, but // making them function arguments lets the inliner specialize // this function to each combination (see two paragraphs above). -inline bool DFA::InlinedSearchLoop(SearchParams* params, - bool have_first_byte, - bool want_earliest_match, - bool run_forward) { +template +inline bool DFA::InlinedSearchLoop(SearchParams* params) { State* start = params->start; const uint8_t* bp = BytePtr(params->text.data()); // start of text const uint8_t* p = bp; // text scanning point @@ -1369,22 +1367,14 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - if (have_first_byte && s == start) { - // In start state, only way out is to find first_byte, - // so use optimized assembly in memchr to skip ahead. - // If first_byte isn't found, we can skip to the end - // of the string. 
- if (run_forward) { - if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { - p = ep; - break; - } - } else { - if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) { - p = ep; - break; - } - p++; + if (can_prefix_accel && s == start) { + // In start state, only way out is to find the prefix, + // so we use prefix accel (e.g. memchr) to skip ahead. + // If not found, we can skip to the end of the string. + p = BytePtr(prog_->PrefixAccel(p, ep - p)); + if (p == NULL) { + p = ep; + break; } } @@ -1498,15 +1488,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, int lastbyte; if (run_forward) { - if (params->text.end() == params->context.end()) + if (EndPtr(params->text) == EndPtr(params->context)) lastbyte = kByteEndText; else - lastbyte = params->text.end()[0] & 0xFF; + lastbyte = EndPtr(params->text)[0] & 0xFF; } else { - if (params->text.begin() == params->context.begin()) + if (BeginPtr(params->text) == BeginPtr(params->context)) lastbyte = kByteEndText; else - lastbyte = params->text.begin()[-1] & 0xFF; + lastbyte = BeginPtr(params->text)[-1] & 0xFF; } State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire); @@ -1559,36 +1549,28 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Inline specializations of the general loop. 
bool DFA::SearchFFF(SearchParams* params) { - return InlinedSearchLoop(params, 0, 0, 0); + return InlinedSearchLoop(params); } bool DFA::SearchFFT(SearchParams* params) { - return InlinedSearchLoop(params, 0, 0, 1); + return InlinedSearchLoop(params); } bool DFA::SearchFTF(SearchParams* params) { - return InlinedSearchLoop(params, 0, 1, 0); + return InlinedSearchLoop(params); } bool DFA::SearchFTT(SearchParams* params) { - return InlinedSearchLoop(params, 0, 1, 1); + return InlinedSearchLoop(params); } bool DFA::SearchTFF(SearchParams* params) { - return InlinedSearchLoop(params, 1, 0, 0); + return InlinedSearchLoop(params); } bool DFA::SearchTFT(SearchParams* params) { - return InlinedSearchLoop(params, 1, 0, 1); + return InlinedSearchLoop(params); } bool DFA::SearchTTF(SearchParams* params) { - return InlinedSearchLoop(params, 1, 1, 0); + return InlinedSearchLoop(params); } bool DFA::SearchTTT(SearchParams* params) { - return InlinedSearchLoop(params, 1, 1, 1); -} - -// For debugging, calls the general code directly. -bool DFA::SlowSearchLoop(SearchParams* params) { - return InlinedSearchLoop(params, - params->first_byte >= 0, - params->want_earliest_match, - params->run_forward); + return InlinedSearchLoop(params); } // For performance, calls the appropriate specialized version @@ -1607,8 +1589,7 @@ bool DFA::FastSearchLoop(SearchParams* params) { &DFA::SearchTTT, }; - bool have_first_byte = params->first_byte >= 0; - int index = 4 * have_first_byte + + int index = 4 * params->can_prefix_accel + 2 * params->want_earliest_match + 1 * params->run_forward; return (this->*Searches[index])(params); @@ -1646,7 +1627,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) { const StringPiece& context = params->context; // Sanity check: make sure that text lies within context. 
- if (text.begin() < context.begin() || text.end() > context.end()) { + if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; params->start = DeadState; return true; @@ -1656,13 +1637,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) { int start; uint32_t flags; if (params->run_forward) { - if (text.begin() == context.begin()) { + if (BeginPtr(text) == BeginPtr(context)) { start = kStartBeginText; flags = kEmptyBeginText|kEmptyBeginLine; - } else if (text.begin()[-1] == '\n') { + } else if (BeginPtr(text)[-1] == '\n') { start = kStartBeginLine; flags = kEmptyBeginLine; - } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) { + } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) { start = kStartAfterWordChar; flags = kFlagLastWord; } else { @@ -1670,13 +1651,13 @@ bool DFA::AnalyzeSearch(SearchParams* params) { flags = 0; } } else { - if (text.end() == context.end()) { + if (EndPtr(text) == EndPtr(context)) { start = kStartBeginText; flags = kEmptyBeginText|kEmptyBeginLine; - } else if (text.end()[0] == '\n') { + } else if (EndPtr(text)[0] == '\n') { start = kStartBeginLine; flags = kEmptyBeginLine; - } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) { + } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) { start = kStartAfterWordChar; flags = kFlagLastWord; } else { @@ -1700,13 +1681,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } + params->start = info->start.load(std::memory_order_acquire); + + // Even if we could prefix accel, we cannot do so when anchored and, + // less obviously, we cannot do so when we are going to need flags. + // This trick works only when there is a single byte that leads to a + // different state! 
+ if (prog_->can_prefix_accel() && + !params->anchored && + params->start > SpecialStateMax && + params->start->flag_ >> kFlagNeedShift == 0) + params->can_prefix_accel = true; + if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n", + fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), info->first_byte.load()); - - params->start = info->start; - params->first_byte = info->first_byte.load(std::memory_order_acquire); + DumpState(params->start).c_str(), params->can_prefix_accel); return true; } @@ -1715,47 +1705,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. - int fb = info->first_byte.load(std::memory_order_acquire); - if (fb != kFbUnknown) + State* start = info->start.load(std::memory_order_acquire); + if (start != NULL) return true; MutexLock l(&mutex_); - fb = info->first_byte.load(std::memory_order_relaxed); - if (fb != kFbUnknown) + start = info->start.load(std::memory_order_relaxed); + if (start != NULL) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, NULL, flags); - if (info->start == NULL) + start = WorkqToCachedState(q0_, NULL, flags); + if (start == NULL) return false; - if (info->start == DeadState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); - return true; - } - - if (info->start == FullMatchState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored - return true; - } - - // Even if we have a first_byte, we cannot use it when anchored and, - // less obviously, we cannot use it when we are going to need flags. 
- // This trick works only when there is a single byte that leads to a - // different state! - int first_byte = prog_->first_byte(); - if (first_byte == -1 || - params->anchored || - info->start->flag_ >> kFlagNeedShift != 0) - first_byte = kFbNone; - // Synchronize with "quick check" above. - info->first_byte.store(first_byte, std::memory_order_release); + info->start.store(start, std::memory_order_release); return true; } @@ -1863,15 +1831,15 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, StringPiece context = const_context; if (context.data() == NULL) context = text; - bool carat = anchor_start(); + bool caret = anchor_start(); bool dollar = anchor_end(); if (reversed_) { using std::swap; - swap(carat, dollar); + swap(caret, dollar); } - if (carat && context.begin() != text.begin()) + if (caret && BeginPtr(context) != BeginPtr(text)) return false; - if (dollar && context.end() != text.end()) + if (dollar && EndPtr(context) != EndPtr(text)) return false; // Handle full match by running an anchored longest match @@ -1904,8 +1872,12 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, bool matched = dfa->Search(text, context, anchored, want_earliest_match, !reversed_, failed, &ep, matches); - if (*failed) + if (*failed) { + hooks::GetDFASearchFailureHook()({ + // Nothing yet... + }); return false; + } if (!matched) return false; if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) @@ -1998,10 +1970,6 @@ int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { return GetDFA(kind)->BuildAllStates(cb); } -void Prog::TEST_dfa_should_bail_when_slow(bool b) { - dfa_should_bail_when_slow = b; -} - // Computes min and max for matching string. // Won't return strings bigger than maxlen. 
bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index e5d8de5ce625e340580df22843af8acc85e9d9f3..5df97456e255b68cf183614803ab5f6da9adb563 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -6,6 +6,7 @@ #include #include +#include #include "util/util.h" #include "util/logging.h" @@ -27,7 +28,22 @@ FilteredRE2::FilteredRE2(int min_atom_len) FilteredRE2::~FilteredRE2() { for (size_t i = 0; i < re2_vec_.size(); i++) delete re2_vec_[i]; - delete prefilter_tree_; +} + +FilteredRE2::FilteredRE2(FilteredRE2&& other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); +} + +FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { + this->~FilteredRE2(); + (void) new (this) FilteredRE2(std::move(other)); + return *this; } RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, @@ -38,7 +54,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << re << " due to error " << re->error(); + << pattern << " due to error " << re->error(); } delete re; } else { diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h index 4118accc87e93e7f3ce29dccd9a3d38741a00d8f..dd618c70e8bfee9cfc8e5118868f5f0a3cd298ee 100644 --- a/re2/filtered_re2.h +++ b/re2/filtered_re2.h @@ -10,17 +10,18 @@ // number of regexps that need to be actually searched. // // By design, it does not include a string matching engine. This is to -// allow the user of the class to use their favorite string match +// allow the user of the class to use their favorite string matching // engine. The overall flow is: Add all the regexps using Add, then -// Compile the FilteredRE2. 
The compile returns strings that need to -// be matched. Note that all returned strings are lowercase. For -// applying regexps to a search text, the caller does the string -// matching using the strings returned. When doing the string match, -// note that the caller has to do that on lower cased version of the -// search text. Then call FirstMatch or AllMatches with a vector of -// indices of strings that were found in the text to get the actual -// regexp matches. - +// Compile the FilteredRE2. Compile returns strings that need to be +// matched. Note that the returned strings are lowercased and distinct. +// For applying regexps to a search text, the caller does the string +// matching using the returned strings. When doing the string match, +// note that the caller has to do that in a case-insensitive way or +// on a lowercased version of the search text. Then call FirstMatch +// or AllMatches with a vector of indices of strings that were found +// in the text to get the actual regexp matches. + +#include #include #include @@ -36,18 +37,25 @@ class FilteredRE2 { explicit FilteredRE2(int min_atom_len); ~FilteredRE2(); + // Not copyable. + FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; + // Movable. + FilteredRE2(FilteredRE2&& other); + FilteredRE2& operator=(FilteredRE2&& other); + // Uses RE2 constructor to create a RE2 object (re). Returns // re->error_code(). If error_code is other than NoError, then re is // deleted and not added to re2_vec_. RE2::ErrorCode Add(const StringPiece& pattern, const RE2::Options& options, - int *id); + int* id); // Prepares the regexps added by Add for filtering. Returns a set // of strings that the caller should check for in candidate texts. - // The returned strings are lowercased. When doing string matching, - // the search text should be lowercased first to find matching - // strings from the set of strings returned by Compile. 
Call after + // The returned strings are lowercased and distinct. When doing + // string matching, it should be performed in a case-insensitive + // way or the search text should be lowercased first. Call after // all Add calls are done. void Compile(std::vector* strings_to_match); @@ -98,10 +106,7 @@ class FilteredRE2 { bool compiled_; // An AND-OR tree of string atoms used for filtering regexps. - PrefilterTree* prefilter_tree_; - - FilteredRE2(const FilteredRE2&) = delete; - FilteredRE2& operator=(const FilteredRE2&) = delete; + std::unique_ptr prefilter_tree_; }; } // namespace re2 diff --git a/re2/fuzzing/compiler-rt/LICENSE b/re2/fuzzing/compiler-rt/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f9dc50615d7ec2b9913dc434fb243fc30889d2a9 --- /dev/null +++ b/re2/fuzzing/compiler-rt/LICENSE @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. 
+ diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h new file mode 100644 index 0000000000000000000000000000000000000000..3e069eba69b46229aa765d36db84197698a5b42a --- /dev/null +++ b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -0,0 +1,305 @@ +//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// A single header library providing an utility class to break up an array of +// bytes. Whenever run on the same input, provides the same output, as long as +// its methods are called in the same order, with the same arguments. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ +#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// In addition to the comments below, the API is also briefly documented at +// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider +class FuzzedDataProvider { + public: + // |data| is an array of length |size| that the FuzzedDataProvider wraps to + // provide more granular access. |data| must outlive the FuzzedDataProvider. + FuzzedDataProvider(const uint8_t *data, size_t size) + : data_ptr_(data), remaining_bytes_(size) {} + ~FuzzedDataProvider() = default; + + // Returns a std::vector containing |num_bytes| of input data. If fewer than + // |num_bytes| of data remain, returns a shorter std::vector containing all + // of the data that's left. 
Can be used with any byte sized type, such as + // char, unsigned char, uint8_t, etc. + template std::vector ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes(num_bytes, num_bytes); + } + + // Similar to |ConsumeBytes|, but also appends the terminator value at the end + // of the resulting vector. Useful, when a mutable null-terminated C-string is + // needed, for example. But that is a rare case. Better avoid it, if possible, + // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + template + std::vector ConsumeBytesWithTerminator(size_t num_bytes, + T terminator = 0) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; + } + + // Returns a std::string containing |num_bytes| of input data. Using this and + // |.c_str()| on the resulting string is the best way to get an immutable + // null-terminated C string. If fewer than |num_bytes| of data remain, returns + // a shorter std::string containing all of the data that's left. + std::string ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast(data_ptr_), + num_bytes); + Advance(num_bytes); + return result; + } + + // Returns a number in the range [min, max] by consuming bytes from the + // input data. The value might not be uniformly distributed in the given + // range. If there's no input data left, always returns |min|. |min| must + // be less than or equal to |max|. 
+ template T ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier to modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; + } + + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits::max()) + result = result % (range + 1); + + return static_cast(min + result); + } + + // Returns a std::string of length from 0 to |max_length|. When it runs out of + // input data, returns what remains of the input. Designed to be more stable + // with respect to a fuzzer inserting characters than just picking a random + // length and then consuming that many bytes with |ConsumeBytes|. + std::string ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. 
+ std::string result; + + // Reserve the anticipated capacity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; + } + result += next; + } + + result.shrink_to_fit(); + return result; + } + + // Returns a std::vector containing all remaining bytes of the input data. + template std::vector ConsumeRemainingBytes() { + return ConsumeBytes(remaining_bytes_); + } + + // Returns a std::string containing all remaining bytes of the input data. + // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string + // object. + std::string ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); + } + + // Returns a number in the range [Type's min, Type's max]. The value might + // not be uniformly distributed in the given range. If there's no input data + // left, always returns |min|. + template T ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits::min(), + std::numeric_limits::max()); + } + + // Reads one byte and returns a bool, or false when no data remains. + bool ConsumeBool() { return 1 & ConsumeIntegral(); } + + // Returns a copy of the value selected from the given fixed-size |array|. + template + T PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange(0, size - 1)]; + } + + template + T PickValueInArray(std::initializer_list list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); + } + + // Returns an enum value. The enum must start at 0 and be contiguous. 
It must + // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: + // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; + template T ConsumeEnum() { + static_assert(std::is_enum::value, "|T| must be an enum type."); + return static_cast(ConsumeIntegralInRange( + 0, static_cast(T::kMaxValue))); + } + + // Returns a floating point number in the range [0.0, 1.0]. If there's no + // input data left, always returns 0. + template T ConsumeProbability() { + static_assert(std::is_floating_point::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast(ConsumeIntegral()); + result /= static_cast(std::numeric_limits::max()); + return result; + } + + // Returns a floating point value in the range [Type's lowest, Type's max] by + // consuming bytes from the input data. If there's no input data left, always + // returns approximately 0. + template T ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), + std::numeric_limits::max()); + } + + // Returns a floating point value in the given range by consuming bytes from + // the input data. If there's no input data left, returns |min|. Note that + // |min| must be less than or equal to |max|. + template T ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first or the second part of the diff. 
+ range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; + } + } else { + range = max - min; + } + + return result + range * ConsumeProbability(); + } + + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } + + private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; + + void Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; + } + + template + std::vector ConsumeBytes(size_t size, size_t num_bytes_to_consume) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector result(size); + if (size == 0) { + if (num_bytes_to_consume != 0) + abort(); + return result; + } + + std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); + Advance(num_bytes_to_consume); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; + } + + template TS ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. 
+ if (std::numeric_limits::is_modulo) + return static_cast(value); + + // Avoid using implementation-defined unsigned to signed conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits::max()) { + return static_cast(value); + } else { + constexpr auto TS_min = std::numeric_limits::min(); + return TS_min + static_cast(value - TS_min); + } + } + + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; + +#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index 061c418e188ca8d92c040675113b694e1f3960e5..3082a769252153f4f48622f24cdda838f0ab17e7 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -2,47 +2,155 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include #include #include -#include -#include -#include +#include #include +#include -#include "re2/prefilter.h" #include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" using re2::StringPiece; // NOT static, NOT signed. uint8_t dummy = 0; -void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { +// Walks kRegexpConcat and kRegexpAlternate subexpressions +// to determine their maximum length. +class SubexpressionWalker : public re2::Regexp::Walker { + public: + SubexpressionWalker() = default; + ~SubexpressionWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: { + int max = nchild_args; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). 
+ int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubexpressionWalker(const SubexpressionWalker&) = delete; + SubexpressionWalker& operator=(const SubexpressionWalker&) = delete; +}; + +// Walks substrings (i.e. kRegexpLiteralString subexpressions) +// to determine their maximum length... in runes, but avoiding +// overheads due to UTF-8 encoding is worthwhile when fuzzing. +class SubstringWalker : public re2::Regexp::Walker { + public: + SubstringWalker() = default; + ~SubstringWalker() override = default; + + int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) override { + switch (re->op()) { + case re2::kRegexpConcat: + case re2::kRegexpAlternate: + case re2::kRegexpStar: + case re2::kRegexpPlus: + case re2::kRegexpQuest: + case re2::kRegexpRepeat: + case re2::kRegexpCapture: { + int max = -1; + for (int i = 0; i < nchild_args; i++) + max = std::max(max, child_args[i]); + return max; + } + + case re2::kRegexpLiteralString: + return re->nrunes(); + + default: + break; + } + return -1; + } + + // Should never be called: we use Walk(), not WalkExponential(). + int ShortVisit(re2::Regexp* re, int parent_arg) override { + return parent_arg; + } + + private: + SubstringWalker(const SubstringWalker&) = delete; + SubstringWalker& operator=(const SubstringWalker&) = delete; +}; + +void TestOneInput(StringPiece pattern, const RE2::Options& options, + StringPiece text) { + // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. + // Otherwise, we will waste time on inputs that have long runs of various + // character classes. The fuzzer has shown itself to be easily capable of + // generating such patterns that fall within the other limits, but result + // in timeouts nonetheless. The marginal cost is high - even more so when + // counted repetition is involved - whereas the marginal benefit is zero. 
+ // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become + // three-element character classes when case-insensitive and using UTF-8. + // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. + int char_class = 0; + int backslash_p = 0; // very expensive, so handle specially + for (size_t i = 0; i < pattern.size(); i++) { + if (pattern[i] == '.' || + pattern[i] == 'k' || pattern[i] == 'K' || + pattern[i] == 's' || pattern[i] == 'S') + char_class++; + if (pattern[i] != '\\') + continue; + i++; + if (i >= pattern.size()) + break; + if (pattern[i] == 'p' || pattern[i] == 'P' || + pattern[i] == 'd' || pattern[i] == 'D' || + pattern[i] == 's' || pattern[i] == 'S' || + pattern[i] == 'w' || pattern[i] == 'W') + char_class++; + if (pattern[i] == 'p' || pattern[i] == 'P') + backslash_p++; + } + if (char_class > 9) + return; + if (backslash_p > 1) + return; + + // The default is 1000. Even 100 turned out to be too generous + // for fuzzing, empirically speaking, so let's try 10 instead. + re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); + RE2 re(pattern, options); if (!re.ok()) return; + // Don't waste time fuzzing programs with large subexpressions. + // They can cause bug reports due to fuzzer timeouts. And they + // aren't interesting for fuzzing purposes. + if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9) + return; + // Don't waste time fuzzing programs with large substrings. // They can cause bug reports due to fuzzer timeouts when they // are repetitions (e.g. hundreds of NUL bytes) and matching is // unanchored. And they aren't interesting for fuzzing purposes. 
- std::unique_ptr prefilter(re2::Prefilter::FromRE2(&re)); - if (prefilter == nullptr) + if (SubstringWalker().Walk(re.Regexp(), -1) > 9) return; - std::queue nodes; - nodes.push(prefilter.get()); - while (!nodes.empty()) { - re2::Prefilter* node = nodes.front(); - nodes.pop(); - if (node->op() == re2::Prefilter::ATOM) { - if (node->atom().size() > 9) - return; - } else if (node->op() == re2::Prefilter::AND || - node->op() == re2::Prefilter::OR) { - for (re2::Prefilter* sub : *node->subs()) - nodes.push(sub); - } - } // Don't waste time fuzzing high-size programs. // They can cause bug reports due to fuzzer timeouts. @@ -55,7 +163,7 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Don't waste time fuzzing high-fanout programs. // They can cause bug reports due to fuzzer timeouts. - std::map histogram; + std::vector histogram; int fanout = re.ProgramFanout(&histogram); if (fanout > 9) return; @@ -102,72 +210,38 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Entry point for libFuzzer. extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - if (size == 0 || size > 999) - return 0; - - // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. - // Otherwise, we will waste time on inputs that have long runs of various - // character classes. The fuzzer has shown itself to be easily capable of - // generating such patterns that fall within the other limits, but result - // in timeouts nonetheless. The marginal cost is high - even more so when - // counted repetition is involved - whereas the marginal benefit is zero. - // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. 
- int char_class = 0; - int backslash_p = 0; // very expensive, so handle specially - for (size_t i = 0; i < size; i++) { - if (data[i] == '.') - char_class++; - if (data[i] != '\\') - continue; - i++; - if (i >= size) - break; - if (data[i] == 'p' || data[i] == 'P' || - data[i] == 'd' || data[i] == 'D' || - data[i] == 's' || data[i] == 'S' || - data[i] == 'w' || data[i] == 'W') - char_class++; - if (data[i] == 'p' || data[i] == 'P') - backslash_p++; - } - if (char_class > 9) - return 0; - if (backslash_p > 1) + // An input larger than 4 KiB probably isn't interesting. (This limit + // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.) + if (size == 0 || size > 4096) return 0; - // The one-at-a-time hash by Bob Jenkins. - uint32_t hash = 0; - for (size_t i = 0; i < size; i++) { - hash += data[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + FuzzedDataProvider fdp(data, size); + // The convention here is that fdp.ConsumeBool() returning false sets + // the default value whereas returning true sets the alternate value: + // most options default to false and so can be set directly; encoding + // defaults to UTF-8; case_sensitive defaults to true. We do NOT want + // to log errors. max_mem is 64 MiB because we can afford to use more + // RAM in exchange for (hopefully) faster fuzzing. RE2::Options options; + options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1 + : RE2::Options::EncodingUTF8); + options.set_posix_syntax(fdp.ConsumeBool()); + options.set_longest_match(fdp.ConsumeBool()); options.set_log_errors(false); options.set_max_mem(64 << 20); - options.set_encoding(hash & 1 ? 
RE2::Options::EncodingLatin1 - : RE2::Options::EncodingUTF8); - options.set_posix_syntax(hash & 2); - options.set_longest_match(hash & 4); - options.set_literal(hash & 8); - options.set_never_nl(hash & 16); - options.set_dot_nl(hash & 32); - options.set_never_capture(hash & 64); - options.set_case_sensitive(hash & 128); - options.set_perl_classes(hash & 256); - options.set_word_boundary(hash & 512); - options.set_one_line(hash & 1024); - - const char* ptr = reinterpret_cast(data); - int len = static_cast(size); - - StringPiece pattern(ptr, len); - StringPiece text(ptr, len); - Test(pattern, options, text); - + options.set_literal(fdp.ConsumeBool()); + options.set_never_nl(fdp.ConsumeBool()); + options.set_dot_nl(fdp.ConsumeBool()); + options.set_never_capture(fdp.ConsumeBool()); + options.set_case_sensitive(!fdp.ConsumeBool()); + options.set_perl_classes(fdp.ConsumeBool()); + options.set_word_boundary(fdp.ConsumeBool()); + options.set_one_line(fdp.ConsumeBool()); + + std::string pattern = fdp.ConsumeRandomLengthString(999); + std::string text = fdp.ConsumeRandomLengthString(999); + + TestOneInput(pattern, options, text); return 0; } diff --git a/re2/make_perl_groups.pl b/re2/make_perl_groups.pl index d9fcdafaafe9d7ee103c6678472f9c5fcfc6ab88..ed0d509dc327c37c5ab6d8a68662c720e210a303 100755 --- a/re2/make_perl_groups.pl +++ b/re2/make_perl_groups.pl @@ -76,7 +76,7 @@ sub PrintClass($$@) { } else { $negname =~ y/a-z/A-Z/; } - return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }"; + return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }"; } my $cnum = 0; diff --git a/re2/mimics_pcre.cc b/re2/mimics_pcre.cc index ad197bef554bdea0813990daed2282d8083f0ad7..b1d6a512286ba875e53ea7cc90e310d91a451d28 100644 --- a/re2/mimics_pcre.cc +++ b/re2/mimics_pcre.cc @@ -38,14 +38,21 @@ static bool CanBeEmptyString(Regexp *re); class PCREWalker : public Regexp::Walker { public: PCREWalker() {} - bool PostVisit(Regexp* 
re, bool parent_arg, bool pre_arg, bool* child_args, - int nchild_args); - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. - LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + LOG(DFATAL) << "PCREWalker::ShortVisit called"; +#endif return a; } + + private: + PCREWalker(const PCREWalker&) = delete; + PCREWalker& operator=(const PCREWalker&) = delete; }; // Called after visiting each of re's children and accumulating @@ -114,13 +121,16 @@ bool Regexp::MimicsPCRE() { class EmptyStringWalker : public Regexp::Walker { public: - EmptyStringWalker() { } - bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); + EmptyStringWalker() {} + + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; +#endif return a; } diff --git a/re2/nfa.cc b/re2/nfa.cc index 77fb5fb1862c6826b04b8352f1e183a4afd73e9d..c7339f8ffd1bf28ed4d4b9776544a7841135a0d1 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -107,18 +108,21 @@ class NFA { // Returns text version of capture information, for debugging. 
std::string FormatCapture(const char** capture); - inline void CopyCapture(const char** dst, const char** src); + void CopyCapture(const char** dst, const char** src) { + memmove(dst, src, ncapture_*sizeof src[0]); + } Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track bool longest_; // whether searching for longest match bool endmatch_; // whether match must end at text.end() - const char* btext_; // beginning of text being matched (for FormatSubmatch) - const char* etext_; // end of text being matched (for endmatch_) + const char* btext_; // beginning of text (for FormatSubmatch) + const char* etext_; // end of text (for endmatch_) Threadq q0_, q1_; // pre-allocated for Search. PODArray stack_; // pre-allocated for AddToThreadq - Thread* free_threads_; // free list + std::deque arena_; // thread arena + Thread* freelist_; // thread freelist const char** match_; // best match so far bool matched_; // any match so far? @@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) { prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + 1; // + 1 for start inst stack_ = PODArray(nstack); - free_threads_ = NULL; + freelist_ = NULL; match_ = NULL; matched_ = false; } NFA::~NFA() { delete[] match_; - Thread* next; - for (Thread* t = free_threads_; t; t = next) { - next = t->next; - delete[] t->capture; - delete t; - } + for (const Thread& t : arena_) + delete[] t.capture; } NFA::Thread* NFA::AllocThread() { - Thread* t = free_threads_; - if (t == NULL) { - t = new Thread; + Thread* t = freelist_; + if (t != NULL) { + freelist_ = t->next; t->ref = 1; - t->capture = new const char*[ncapture_]; + // We don't need to touch t->capture because + // the caller will immediately overwrite it. 
return t; } - free_threads_ = t->next; + arena_.emplace_back(); + t = &arena_.back(); t->ref = 1; + t->capture = new const char*[ncapture_]; return t; } @@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) { } void NFA::Decref(Thread* t) { - if (t == NULL) - return; + DCHECK(t != NULL); t->ref--; if (t->ref > 0) return; DCHECK_EQ(t->ref, 0); - t->next = free_threads_; - free_threads_ = t; -} - -void NFA::CopyCapture(const char** dst, const char** src) { - for (int i = 0; i < ncapture_; i+=2) { - dst[i] = src[i]; - dst[i+1] = src[i+1]; - } + t->next = freelist_; + freelist_ = t; } // Follows all empty arrows from id0 and enqueues all the states reached. @@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, matched_ = true; Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); if (ip->greedy(prog_)) return ip->out1(); @@ -416,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, // worse than the one we just found: don't run the // rest of the current Threadq. Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); return 0; } @@ -457,14 +456,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, context = text; // Sanity check: make sure that text lies within context. 
- if (text.begin() < context.begin() || text.end() > context.end()) { + if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; return false; } - if (prog_->anchor_start() && context.begin() != text.begin()) + if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text)) return false; - if (prog_->anchor_end() && context.end() != text.end()) + if (prog_->anchor_end() && EndPtr(context) != EndPtr(text)) return false; anchored |= prog_->anchor_start(); if (prog_->anchor_end()) { @@ -489,6 +488,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } match_ = new const char*[ncapture_]; + memset(match_, 0, ncapture_*sizeof match_[0]); matched_ = false; // For debugging prints. @@ -505,7 +505,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, Threadq* nextq = &q1_; runq->clear(); nextq->clear(); - memset(&match_[0], 0, ncapture_*sizeof match_[0]); // Loop over the text, stepping the machine. for (const char* p = text.data();; p++) { @@ -572,16 +571,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // matches, since it would be to the right of the match // we already found.) if (!matched_ && (!anchored || p == text.data())) { - // If there's a required first byte for an unanchored search - // and we're not in the middle of any possible matches, - // use memchr to search for the byte quickly. - int fb = prog_->first_byte(); + // Try to use prefix accel (e.g. memchr) to skip ahead. + // The search must be unanchored and there must be zero + // possible matches already. 
if (!anchored && runq->size() == 0 && - fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) { - p = reinterpret_cast(memchr(p, fb, etext_ - p)); - if (p == NULL) { + p < etext_ && prog_->can_prefix_accel()) { + p = reinterpret_cast(prog_->PrefixAccel(p, etext_ - p)); + if (p == NULL) p = etext_; - } } Thread* t = AllocThread(); @@ -603,7 +600,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // by simply not continuing the loop. // This complements the special case in NFA::Step(). if (p == NULL) { - (void)Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); + (void) Step(runq, nextq, -1, context, p); DCHECK_EQ(runq->size(), 0); using std::swap; swap(nextq, runq); @@ -612,8 +609,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } } - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) - Decref(i->value()); + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } if (matched_) { for (int i = 0; i < nsubmatch; i++) @@ -629,67 +628,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; } -// Computes whether all successful matches have a common first byte, -// and if so, returns that byte. If not, returns -1. -int Prog::ComputeFirstByte() { - int b = -1; - SparseSet q(size()); - q.insert(start()); - for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { - int id = *it; - Prog::Inst* ip = inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; - break; - - case kInstMatch: - // The empty string matches: no first byte. 
- return -1; - - case kInstByteRange: - if (!ip->last()) - q.insert(id+1); - - // Must match only a single byte - if (ip->lo() != ip->hi()) - return -1; - if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') - return -1; - // If we haven't seen any bytes yet, record it; - // otherwise must match the one we saw before. - if (b == -1) - b = ip->lo(); - else if (b != ip->lo()) - return -1; - break; - - case kInstNop: - case kInstCapture: - case kInstEmptyWidth: - if (!ip->last()) - q.insert(id+1); - - // Continue on. - // Ignore ip->empty() flags for kInstEmptyWidth - // in order to be as conservative as possible - // (assume all possible empty-width flags are true). - if (ip->out()) - q.insert(ip->out()); - break; - - case kInstAltMatch: - DCHECK(!ip->last()); - q.insert(id+1); - break; - - case kInstFail: - break; - } - } - return b; -} - bool Prog::SearchNFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, @@ -708,7 +646,7 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context, } if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/onepass.cc b/re2/onepass.cc index 66a62d94b0ee8238a1a305d9aebe1187caed1c2f..263974654dbca114d7840a02f77c50a24a993b25 100644 --- a/re2/onepass.cc +++ b/re2/onepass.cc @@ -237,9 +237,9 @@ bool Prog::SearchOnePass(const StringPiece& text, StringPiece context = const_context; if (context.data() == NULL) context = text; - if (anchor_start() && context.begin() != text.begin()) + if (anchor_start() && BeginPtr(context) != BeginPtr(text)) return false; - if (anchor_end() && context.end() != text.end()) + if (anchor_end() && EndPtr(context) != EndPtr(text)) return false; if (anchor_end()) kind = kFullMatch; diff --git a/re2/parse.cc b/re2/parse.cc index 
50dfdac1aaa26af03a3063d0b571f4fbeab49da1..85f16f060ba366e897a1de1381fd964b1f441749 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -44,12 +44,12 @@ namespace re2 { -// Reduce the maximum repeat count by an order of magnitude when fuzzing. -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -static const int kMaxRepeat = 100; -#else -static const int kMaxRepeat = 1000; -#endif +// Controls the maximum repeat count permitted by the parser. +static int maximum_repeat_count = 1000; + +void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { + maximum_repeat_count = i; +} // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of @@ -93,7 +93,7 @@ class Regexp::ParseState { bool PushSimpleOp(RegexpOp op); // Pushes a ^ onto the stack. - bool PushCarat(); + bool PushCaret(); // Pushes a \b (word == true) or \B (word == false) onto the stack. bool PushWordBoundary(bool word); @@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } // Pushes a ^ onto the stack. -bool Regexp::ParseState::PushCarat() { +bool Regexp::ParseState::PushCaret() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); } @@ -556,9 +556,10 @@ int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, } int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; +#endif return 0; } @@ -567,7 +568,9 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { bool Regexp::ParseState::PushRepetition(int min, int max, const StringPiece& s, bool nongreedy) { - if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) { + if ((max != -1 && max < min) || + min > maximum_repeat_count || + max > maximum_repeat_count) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -590,7 +593,7 @@ bool Regexp::ParseState::PushRepetition(int min, int max, stacktop_ = re; if (min >= 2 || max >= 2) { RepetitionWalker w; - if (w.Walk(stacktop_, kMaxRepeat) == 0) { + if (w.Walk(stacktop_, maximum_repeat_count) == 0) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -684,7 +687,7 @@ bool Regexp::ParseState::DoRightParen() { if ((r1 = stacktop_) == NULL || (r2 = r1->down_) == NULL || r2->op() != kLeftParen) { - status_->set_code(kRegexpMissingParen); + status_->set_code(kRegexpUnexpectedParen); status_->set_error_arg(whole_regexp_); return false; } @@ -1406,13 +1409,15 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { } } - status->set_code(kRegexpBadUTF8); - status->set_error_arg(StringPiece()); + if (status != NULL) { + status->set_code(kRegexpBadUTF8); + status->set_error_arg(StringPiece()); + } return -1; } -// Return whether name is valid UTF-8. -// If not, set status to kRegexpBadUTF8. +// Returns whether name is valid UTF-8. +// If not, sets status to kRegexpBadUTF8. static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { StringPiece t = s; Rune r; @@ -1801,14 +1806,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Convert the UnicodeSet to a URange32 and UGroup that we can add. 
int nr = uset.getRangeCount(); - URange32* r = new URange32[nr]; + PODArray r(nr); for (int i = 0; i < nr; i++) { r[i].lo = uset.getRangeStart(i); r[i].hi = uset.getRangeEnd(i); } - UGroup g = {"", +1, 0, 0, r, nr}; + UGroup g = {"", +1, 0, 0, r.data(), nr}; AddUGroup(cc, &g, sign, parse_flags); - delete[] r; #endif return kParseOk; @@ -2011,19 +2015,34 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, return true; } -// Is this a valid capture name? [A-Za-z0-9_]+ -// PCRE limits names to 32 bytes. -// Python rejects names starting with digits. -// We don't enforce either of those. +// Returns whether name is a valid capture name. static bool IsValidCaptureName(const StringPiece& name) { if (name.empty()) return false; - for (size_t i = 0; i < name.size(); i++) { - int c = name[i]; - if (('0' <= c && c <= '9') || - ('a' <= c && c <= 'z') || - ('A' <= c && c <= 'Z') || - c == '_') + + // Historically, we effectively used [0-9A-Za-z_]+ to validate; that + // followed Python 2 except for not restricting the first character. + // As of Python 3, Unicode characters beyond ASCII are also allowed; + // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and + // Pc categories, but again without restricting the first character. + // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3 + // performs it for identifiers, but seemingly not for capture names; + // if they start doing that for capture names, we won't follow suit. 
+ static const CharClass* const cc = []() { + CharClassBuilder ccb; + for (StringPiece group : + {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"}) + AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups), + +1, Regexp::NoParseFlags); + return ccb.GetCharClass(); + }(); + + StringPiece t = name; + Rune r; + while (!t.empty()) { + if (StringPieceToRune(&r, &t, NULL) < 0) + return false; + if (cc->Contains(r)) continue; return false; } @@ -2271,7 +2290,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; case '^': // Beginning of line. - if (!ps.PushCarat()) + if (!ps.PushCaret()) return NULL; t.remove_prefix(1); // '^' break; diff --git a/re2/perl_groups.cc b/re2/perl_groups.cc index 422b3882d494d8ddadda9445e19fba0d0208508d..46874445816916a34087cf95a87345c94f595351 100644 --- a/re2/perl_groups.cc +++ b/re2/perl_groups.cc @@ -20,12 +20,12 @@ static const URange16 code3[] = { /* \w */ { 0x61, 0x7a }, }; const UGroup perl_groups[] = { - { "\\d", +1, code1, 1 }, - { "\\D", -1, code1, 1 }, - { "\\s", +1, code2, 3 }, - { "\\S", -1, code2, 3 }, - { "\\w", +1, code3, 4 }, - { "\\W", -1, code3, 4 }, + { "\\d", +1, code1, 1, 0, 0 }, + { "\\D", -1, code1, 1, 0, 0 }, + { "\\s", +1, code2, 3, 0, 0 }, + { "\\S", -1, code2, 3, 0, 0 }, + { "\\w", +1, code3, 4, 0, 0 }, + { "\\W", -1, code3, 4, 0, 0 }, }; const int num_perl_groups = 6; static const URange16 code4[] = { /* [:alnum:] */ @@ -85,34 +85,34 @@ static const URange16 code17[] = { /* [:xdigit:] */ { 0x61, 0x66 }, }; const UGroup posix_groups[] = { - { "[:alnum:]", +1, code4, 3 }, - { "[:^alnum:]", -1, code4, 3 }, - { "[:alpha:]", +1, code5, 2 }, - { "[:^alpha:]", -1, code5, 2 }, - { "[:ascii:]", +1, code6, 1 }, - { "[:^ascii:]", -1, code6, 1 }, - { "[:blank:]", +1, code7, 2 }, - { "[:^blank:]", -1, code7, 2 }, - { "[:cntrl:]", +1, code8, 2 }, - { "[:^cntrl:]", -1, code8, 2 }, - { "[:digit:]", +1, code9, 1 }, - { "[:^digit:]", -1, code9, 1 }, - { "[:graph:]", +1, code10, 1 
}, - { "[:^graph:]", -1, code10, 1 }, - { "[:lower:]", +1, code11, 1 }, - { "[:^lower:]", -1, code11, 1 }, - { "[:print:]", +1, code12, 1 }, - { "[:^print:]", -1, code12, 1 }, - { "[:punct:]", +1, code13, 4 }, - { "[:^punct:]", -1, code13, 4 }, - { "[:space:]", +1, code14, 2 }, - { "[:^space:]", -1, code14, 2 }, - { "[:upper:]", +1, code15, 1 }, - { "[:^upper:]", -1, code15, 1 }, - { "[:word:]", +1, code16, 4 }, - { "[:^word:]", -1, code16, 4 }, - { "[:xdigit:]", +1, code17, 3 }, - { "[:^xdigit:]", -1, code17, 3 }, + { "[:alnum:]", +1, code4, 3, 0, 0 }, + { "[:^alnum:]", -1, code4, 3, 0, 0 }, + { "[:alpha:]", +1, code5, 2, 0, 0 }, + { "[:^alpha:]", -1, code5, 2, 0, 0 }, + { "[:ascii:]", +1, code6, 1, 0, 0 }, + { "[:^ascii:]", -1, code6, 1, 0, 0 }, + { "[:blank:]", +1, code7, 2, 0, 0 }, + { "[:^blank:]", -1, code7, 2, 0, 0 }, + { "[:cntrl:]", +1, code8, 2, 0, 0 }, + { "[:^cntrl:]", -1, code8, 2, 0, 0 }, + { "[:digit:]", +1, code9, 1, 0, 0 }, + { "[:^digit:]", -1, code9, 1, 0, 0 }, + { "[:graph:]", +1, code10, 1, 0, 0 }, + { "[:^graph:]", -1, code10, 1, 0, 0 }, + { "[:lower:]", +1, code11, 1, 0, 0 }, + { "[:^lower:]", -1, code11, 1, 0, 0 }, + { "[:print:]", +1, code12, 1, 0, 0 }, + { "[:^print:]", -1, code12, 1, 0, 0 }, + { "[:punct:]", +1, code13, 4, 0, 0 }, + { "[:^punct:]", -1, code13, 4, 0, 0 }, + { "[:space:]", +1, code14, 2, 0, 0 }, + { "[:^space:]", -1, code14, 2, 0, 0 }, + { "[:upper:]", +1, code15, 1, 0, 0 }, + { "[:^upper:]", -1, code15, 1, 0, 0 }, + { "[:word:]", +1, code16, 4, 0, 0 }, + { "[:^word:]", -1, code16, 4, 0, 0 }, + { "[:xdigit:]", +1, code17, 3, 0, 0 }, + { "[:^xdigit:]", -1, code17, 3, 0, 0 }, }; const int num_posix_groups = 28; diff --git a/re2/pod_array.h b/re2/pod_array.h index e8093ad3f98269eff5e69d6bd31bd9e9fba82ce1..f234e976f40de79cf4b3aa9703371ecf8ab19fb3 100644 --- a/re2/pod_array.h +++ b/re2/pod_array.h @@ -13,7 +13,7 @@ namespace re2 { template class PODArray { public: - static_assert(std::is_pod::value, + 
static_assert(std::is_trivial::value && std::is_standard_layout::value, "T must be POD"); PODArray() diff --git a/re2/prefilter.cc b/re2/prefilter.cc index f61d54b8f81fac7025c29b2c979fb41cc77da71d..a47b3120fbea84fac59de936ab69a16232e64e7a 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -648,14 +648,15 @@ Prefilter* Prefilter::FromRegexp(Regexp* re) { return NULL; Regexp* simple = re->Simplify(); - Prefilter::Info *info = BuildInfo(simple); + if (simple == NULL) + return NULL; + Prefilter::Info* info = BuildInfo(simple); simple->Decref(); if (info == NULL) return NULL; Prefilter* m = info->TakeMatch(); - delete info; return m; } diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc index 187e2ec55200c63765f5d12c15a6336d12b05af2..fdf4e083c9a42d9252f4da505c7569b9ed36f598 100644 --- a/re2/prefilter_tree.cc +++ b/re2/prefilter_tree.cc @@ -107,7 +107,7 @@ void PrefilterTree::Compile(std::vector* atom_vec) { Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { std::string node_string = NodeString(node); - std::map::iterator iter = nodes->find(node_string); + NodeMap::iterator iter = nodes->find(node_string); if (iter == nodes->end()) return NULL; return (*iter).second; @@ -377,7 +377,7 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { LOG(ERROR) << it->first; } LOG(ERROR) << "Map:"; - for (std::map::const_iterator iter = nodes->begin(); + for (NodeMap::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() << " Str: " << (*iter).first; diff --git a/re2/prog.cc b/re2/prog.cc index cc35917537a31ee5dd21f147993a29b6e43af1a3..55dc10578b6944eb389ff85af9756a80d75be340 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -7,6 +7,12 @@ #include "re2/prog.h" +#if defined(__AVX2__) +#include +#ifdef _MSC_VER +#include +#endif +#endif #include #include #include @@ -109,9 +115,10 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), - first_byte_(-1), - flags_(0), + 
prefix_foldcase_(false), + prefix_size_(0), list_count_(0), + bit_state_text_max_size_(0), dfa_mem_(0), dfa_first_(NULL), dfa_longest_(NULL) { @@ -120,6 +127,8 @@ Prog::Prog() Prog::~Prog() { DeleteDFA(dfa_longest_); DeleteDFA(dfa_first_); + if (prefix_foldcase_) + delete[] prefix_dfa_; } typedef SparseSet Workq; @@ -185,14 +194,31 @@ std::string Prog::DumpByteMap() { return map; } -int Prog::first_byte() { - std::call_once(first_byte_once_, [](Prog* prog) { - prog->first_byte_ = prog->ComputeFirstByte(); - }, this); - return first_byte_; -} +// Is ip a guaranteed match at end of text, perhaps after some capturing? +static bool IsMatch(Prog* prog, Prog::Inst* ip) { + for (;;) { + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); + return false; + + case kInstAlt: + case kInstAltMatch: + case kInstByteRange: + case kInstFail: + case kInstEmptyWidth: + return false; + + case kInstCapture: + case kInstNop: + ip = prog->inst(ip->out()); + break; -static bool IsMatch(Prog*, Prog::Inst*); + case kInstMatch: + return true; + } + } +} // Peep-hole optimizer. void Prog::Optimize() { @@ -258,32 +284,6 @@ void Prog::Optimize() { } } -// Is ip a guaranteed match at end of text, perhaps after some capturing? 
-static bool IsMatch(Prog* prog, Prog::Inst* ip) { - for (;;) { - switch (ip->opcode()) { - default: - LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); - return false; - - case kInstAlt: - case kInstAltMatch: - case kInstByteRange: - case kInstFail: - case kInstEmptyWidth: - return false; - - case kInstCapture: - case kInstNop: - ip = prog->inst(ip->out()); - break; - - case kInstMatch: - return true; - } - } -} - uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { int flags = 0; @@ -641,6 +641,11 @@ void Prog::Flatten() { for (int i = 0; i < list_count_; ++i) list_heads_[flatmap[i]] = i; } + + // BitState allocates a bitmap of size list_count_ * (text.size()+1) + // for tracking pairs of possibilities that it has already explored. + const size_t kBitStateBitmapMaxSize = 256*1024; // max size in bits + bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1; } void Prog::MarkSuccessors(SparseArray* rootmap, @@ -918,4 +923,250 @@ void Prog::ComputeHints(std::vector* flat, int begin, int end) { } } +// The final state will always be this, which frees up a register for the hot +// loop and thus avoids the spilling that can occur when building with Clang. +static const size_t kShiftDFAFinal = 9; + +// This function takes the prefix as std::string (i.e. not const std::string& +// as normal) because it's going to clobber it, so a temporary is convenient. +static uint64_t* BuildShiftDFA(std::string prefix) { + // This constant is for convenience now and also for correctness later when + // we clobber the prefix, but still need to know how long it was initially. + const size_t size = prefix.size(); + + // Construct the NFA. + // The table is indexed by input byte; each element is a bitfield of states + // reachable by the input byte. Given a bitfield of the current states, the + // bitfield of states reachable from those is - for this specific purpose - + // always ((ncurr << 1) | 1). 
Intersecting the reachability bitfields gives + // the bitfield of the next states reached by stepping over the input byte. + // Credits for this technique: the Hyperscan paper by Geoff Langdale et al. + uint16_t nfa[256]{}; + for (size_t i = 0; i < size; ++i) { + uint8_t b = prefix[i]; + nfa[b] |= 1 << (i+1); + } + // This is the `\C*?` for unanchored search. + for (int b = 0; b < 256; ++b) + nfa[b] |= 1; + + // This maps from DFA state to NFA states; the reverse mapping is used when + // recording transitions and gets implemented with plain old linear search. + // The "Shift DFA" technique limits this to ten states when using uint64_t; + // to allow for the initial state, we use at most nine bytes of the prefix. + // That same limit is also why uint16_t is sufficient for the NFA bitfield. + uint16_t states[kShiftDFAFinal+1]{}; + states[0] = 1; + for (size_t dcurr = 0; dcurr < size; ++dcurr) { + uint8_t b = prefix[dcurr]; + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = dcurr+1; + if (dnext == size) + dnext = kShiftDFAFinal; + states[dnext] = nnext; + } + + // Sort and unique the bytes of the prefix to avoid repeating work while we + // record transitions. This clobbers the prefix, but it's no longer needed. + std::sort(prefix.begin(), prefix.end()); + prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end()); + + // Construct the DFA. + // The table is indexed by input byte; each element is effectively a packed + // array of uint6_t; each array value will be multiplied by six in order to + // avoid having to do so later in the hot loop as well as masking/shifting. + // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen. + uint64_t* dfa = new uint64_t[256]{}; + // Record a transition from each state for each of the bytes of the prefix. + // Note that all other input bytes go back to the initial state by default. 
+ for (size_t dcurr = 0; dcurr < size; ++dcurr) { + for (uint8_t b : prefix) { + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = 0; + while (states[dnext] != nnext) + ++dnext; + dfa[b] |= static_cast(dnext * 6) << (dcurr * 6); + // Convert ASCII letters to uppercase and record the extra transitions. + // Note that ASCII letters are guaranteed to be lowercase at this point + // because that's how the parser normalises them. #FunFact: 'k' and 's' + // match U+212A and U+017F, respectively, so they won't occur here when + // using UTF-8 encoding because the parser will emit character classes. + if ('a' <= b && b <= 'z') { + b -= 'a' - 'A'; + dfa[b] |= static_cast(dnext * 6) << (dcurr * 6); + } + } + } + // This lets the final state "saturate", which will matter for performance: + // in the hot loop, we check for a match only at the end of each iteration, + // so we must keep signalling the match until we get around to checking it. + for (int b = 0; b < 256; ++b) + dfa[b] |= static_cast(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6); + + return dfa; +} + +void Prog::ConfigurePrefixAccel(const std::string& prefix, + bool prefix_foldcase) { + prefix_foldcase_ = prefix_foldcase; + prefix_size_ = prefix.size(); + if (prefix_foldcase_) { + // Use PrefixAccel_ShiftDFA(). + // ... and no more than nine bytes of the prefix. (See above for details.) + prefix_size_ = std::min(prefix_size_, kShiftDFAFinal); + prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_)); + } else if (prefix_size_ != 1) { + // Use PrefixAccel_FrontAndBack(). + prefix_front_ = prefix.front(); + prefix_back_ = prefix.back(); + } else { + // Use memchr(3). + prefix_front_ = prefix.front(); + } +} + +const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) { + if (size < prefix_size_) + return NULL; + + uint64_t curr = 0; + + // At the time of writing, rough benchmarks on a Broadwell machine showed + // that this unroll factor (i.e. 
eight) achieves a speedup factor of two. + if (size >= 8) { + const uint8_t* p = reinterpret_cast(data); + const uint8_t* endp = p + (size&~7); + do { + uint8_t b0 = p[0]; + uint8_t b1 = p[1]; + uint8_t b2 = p[2]; + uint8_t b3 = p[3]; + uint8_t b4 = p[4]; + uint8_t b5 = p[5]; + uint8_t b6 = p[6]; + uint8_t b7 = p[7]; + + uint64_t next0 = prefix_dfa_[b0]; + uint64_t next1 = prefix_dfa_[b1]; + uint64_t next2 = prefix_dfa_[b2]; + uint64_t next3 = prefix_dfa_[b3]; + uint64_t next4 = prefix_dfa_[b4]; + uint64_t next5 = prefix_dfa_[b5]; + uint64_t next6 = prefix_dfa_[b6]; + uint64_t next7 = prefix_dfa_[b7]; + + uint64_t curr0 = next0 >> (curr & 63); + uint64_t curr1 = next1 >> (curr0 & 63); + uint64_t curr2 = next2 >> (curr1 & 63); + uint64_t curr3 = next3 >> (curr2 & 63); + uint64_t curr4 = next4 >> (curr3 & 63); + uint64_t curr5 = next5 >> (curr4 & 63); + uint64_t curr6 = next6 >> (curr5 & 63); + uint64_t curr7 = next7 >> (curr6 & 63); + + if ((curr7 & 63) == kShiftDFAFinal * 6) { + // At the time of writing, using the same masking subexpressions from + // the preceding lines caused Clang to clutter the hot loop computing + // them - even though they aren't actually needed for shifting! Hence + // these rewritten conditions, which achieve a speedup factor of two. 
+ if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_; + if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_; + if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_; + if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_; + if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_; + if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_; + if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_; + if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_; + } + + curr = curr7; + p += 8; + } while (p != endp); + data = p; + size = size&7; + } + + const uint8_t* p = reinterpret_cast(data); + const uint8_t* endp = p + size; + while (p != endp) { + uint8_t b = *p++; + uint64_t next = prefix_dfa_[b]; + curr = next >> (curr & 63); + if ((curr & 63) == kShiftDFAFinal * 6) + return p-prefix_size_; + } + return NULL; +} + +#if defined(__AVX2__) +// Finds the least significant non-zero bit in n. +static int FindLSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return __builtin_ctz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanForward(&c, n); + return static_cast(c); +#else + int c = 31; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif +} +#endif + +const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 2); + if (size < prefix_size_) + return NULL; + // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. + // This also means that probing for prefix_back_ doesn't go out of bounds. + size -= prefix_size_-1; + +#if defined(__AVX2__) + // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. 
+ if (size >= sizeof(__m256i)) { + const __m256i* fp = reinterpret_cast( + reinterpret_cast(data)); + const __m256i* bp = reinterpret_cast( + reinterpret_cast(data) + prefix_size_-1); + const __m256i* endfp = fp + size/sizeof(__m256i); + const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); + const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); + do { + const __m256i f_loadu = _mm256_loadu_si256(fp++); + const __m256i b_loadu = _mm256_loadu_si256(bp++); + const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); + const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); + const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); + if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. + const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); + const int fb_movemask = _mm256_movemask_epi8(fb_and); + const int fb_ctz = FindLSBSet(fb_movemask); + return reinterpret_cast(fp-1) + fb_ctz; + } + } while (fp != endfp); + data = fp; + size = size%sizeof(__m256i); + } +#endif + + const char* p0 = reinterpret_cast(data); + for (const char* p = p0;; p++) { + DCHECK_GE(size, static_cast(p-p0)); + p = reinterpret_cast(memchr(p, prefix_front_, size - (p-p0))); + if (p == NULL || p[prefix_size_-1] == prefix_back_) + return p; + } +} + } // namespace re2 diff --git a/re2/prog.h b/re2/prog.h index 40a6ce4e3c862b92403b58ca669d5bfdbc2e62f9..4af012ab6fc8982d5fdc09052a23cd4a896f5243 100644 --- a/re2/prog.h +++ b/re2/prog.h @@ -198,8 +198,8 @@ class Prog { Inst *inst(int id) { return &inst_[id]; } int start() { return start_; } - int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } + int start_unanchored() { return start_unanchored_; } void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } bool reversed() { return reversed_; } @@ -207,19 +207,40 @@ class Prog { int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { 
return list_heads_.data(); } - void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } + size_t bit_state_text_max_size() { return bit_state_text_max_size_; } int64_t dfa_mem() { return dfa_mem_; } - int flags() { return flags_; } - void set_flags(int flags) { flags_ = flags; } + void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } + bool can_prefix_accel() { return prefix_size_ != 0; } + + // Accelerates to the first likely occurrence of the prefix. + // Returns a pointer to the first byte or NULL if not found. + const void* PrefixAccel(const void* data, size_t size) { + DCHECK(can_prefix_accel()); + if (prefix_foldcase_) { + return PrefixAccel_ShiftDFA(data, size); + } else if (prefix_size_ != 1) { + return PrefixAccel_FrontAndBack(data, size); + } else { + return memchr(data, prefix_front_, size); + } + } + + // Configures prefix accel using the analysis performed during compilation. + void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase); + + // An implementation of prefix accel that uses prefix_dfa_ to perform + // case-insensitive search. + const void* PrefixAccel_ShiftDFA(const void* data, size_t size); - // Lazily computed. - int first_byte(); + // An implementation of prefix accel that looks for prefix_front_ and + // prefix_back_ to return fewer false positives than memchr(3) alone. + const void* PrefixAccel_FrontAndBack(const void* data, size_t size); // Returns string representation of program for debugging. std::string Dump(); @@ -290,17 +311,9 @@ class Prog { // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); - // Controls whether the DFA should bail out early if the NFA would be faster. 
- // FOR TESTING ONLY. - static void TEST_dfa_should_bail_when_slow(bool b); - // Compute bytemap. void ComputeByteMap(); - // Computes whether all matches must begin with the same first - // byte, and if so, returns that byte. If not, returns -1. - int ComputeFirstByte(); - // Run peep-hole optimizer on program. void Optimize(); @@ -386,6 +399,10 @@ class Prog { // Computes hints for ByteRange instructions in [begin, end). void ComputeHints(std::vector* flat, int begin, int end); + // Controls whether the DFA should bail out early if the NFA would be faster. + // FOR TESTING ONLY. + static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b); + private: friend class Compiler; @@ -402,13 +419,22 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ - int first_byte_; // required first byte for match, or -1 if none - int flags_; // regexp parse flags - int list_count_; // count of lists (see above) - int inst_count_[kNumInst]; // count of instructions by opcode - PODArray list_heads_; // sparse array enumerating list heads - // not populated if size_ is overly large + bool prefix_foldcase_; // whether prefix is case-insensitive + size_t prefix_size_; // size of prefix (0 if no prefix) + union { + uint64_t* prefix_dfa_; // "Shift DFA" for prefix + struct { + int prefix_front_; // first byte of prefix + int prefix_back_; // last byte of prefix + }; + }; + + int list_count_; // count of lists (see above) + int inst_count_[kNumInst]; // count of instructions by opcode + PODArray list_heads_; // sparse array enumerating list heads + // not populated if size_ is overly large + size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size() PODArray inst_; // pointer to instruction array PODArray onepass_nodes_; // data for OnePass nodes @@ -419,7 +445,6 @@ class Prog { uint8_t bytemap_[256]; // map from input bytes to byte classes - std::once_flag 
first_byte_once_; std::once_flag dfa_first_once_; std::once_flag dfa_longest_once_; @@ -427,6 +452,17 @@ class Prog { Prog& operator=(const Prog&) = delete; }; +// std::string_view in MSVC has iterators that aren't just pointers and +// that don't allow comparisons between different objects - not even if +// those objects are views into the same string! Thus, we provide these +// conversion functions for convenience. +static inline const char* BeginPtr(const StringPiece& s) { + return s.data(); +} +static inline const char* EndPtr(const StringPiece& s) { + return s.data() + s.size(); +} + } // namespace re2 #endif // RE2_PROG_H_ diff --git a/re2/re2.cc b/re2/re2.cc index a8dd24b00c389dbd754f328215c7cf992e5d4de4..c02713322f7ecf99bf8a9d3e8c40bbf38483204e 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -12,10 +12,14 @@ #include #include #include +#ifdef _MSC_VER +#include +#endif #include #include #include #include +#include #include #include #include @@ -79,6 +83,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorMissingBracket; case re2::kRegexpMissingParen: return RE2::ErrorMissingParen; + case re2::kRegexpUnexpectedParen: + return RE2::ErrorUnexpectedParen; case re2::kRegexpTrailingBackslash: return RE2::ErrorTrailingBackslash; case re2::kRegexpRepeatArgument: @@ -172,15 +178,20 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { empty_group_names = new std::map; }); - pattern_ = std::string(pattern); + pattern_.assign(pattern.data(), pattern.size()); options_.Copy(options); entire_regexp_ = NULL; + error_ = empty_string; + error_code_ = NoError; + error_arg_.clear(); + prefix_.clear(); + prefix_foldcase_ = false; suffix_regexp_ = NULL; prog_ = NULL; num_captures_ = -1; + is_one_pass_ = false; + rprog_ = NULL; - error_ = empty_string; - error_code_ = NoError; named_groups_ = NULL; group_names_ = NULL; @@ -239,9 +250,11 @@ re2::Prog* RE2::ReverseProg() const { if (re->rprog_ == NULL) { if 
(re->options_.log_errors()) LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; - re->error_ = - new std::string("pattern too large - reverse compile failed"); - re->error_code_ = RE2::ErrorPatternTooLarge; + // We no longer touch error_ and error_code_ because failing to compile + // the reverse Prog is not a showstopper: falling back to NFA execution + // is fine. More importantly, an RE2 object is supposed to be logically + // immutable: whatever ok() would have returned after Init() completed, + // it should continue to return that no matter what ReverseProg() does. } }, this); return rprog_; @@ -277,28 +290,54 @@ int RE2::ReverseProgramSize() const { return prog->size(); } -static int Fanout(Prog* prog, std::map* histogram) { +// Finds the most significant non-zero bit in n. +static int FindMSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return 31 ^ __builtin_clz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanReverse(&c, n); + return static_cast(c); +#else + int c = 0; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n >> shift; + if (word != 0) { + n = word; + c += shift; + } + } + return c; +#endif +} + +static int Fanout(Prog* prog, std::vector* histogram) { SparseArray fanout(prog->size()); prog->Fanout(&fanout); - histogram->clear(); + int data[32] = {}; + int size = 0; for (SparseArray::iterator i = fanout.begin(); i != fanout.end(); ++i) { - // TODO(junyer): Optimise this? - int bucket = 0; - while (1 << bucket < i->value()) { - bucket++; - } - (*histogram)[bucket]++; + if (i->value() == 0) + continue; + uint32_t value = i->value(); + int bucket = FindMSBSet(value); + bucket += value & (value-1) ? 
1 : 0; + ++data[bucket]; + size = std::max(size, bucket+1); } - return histogram->rbegin()->first; + if (histogram != NULL) + histogram->assign(data, data+size); + return size-1; } -int RE2::ProgramFanout(std::map* histogram) const { +int RE2::ProgramFanout(std::vector* histogram) const { if (prog_ == NULL) return -1; return Fanout(prog_, histogram); } -int RE2::ReverseProgramFanout(std::map* histogram) const { +int RE2::ReverseProgramFanout(std::vector* histogram) const { if (prog_ == NULL) return -1; Prog* prog = ReverseProg(); @@ -368,6 +407,8 @@ bool RE2::Replace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) @@ -388,6 +429,8 @@ int RE2::GlobalReplace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; @@ -460,9 +503,10 @@ bool RE2::Extract(const StringPiece& text, std::string* out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; - if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -610,6 +654,8 @@ bool RE2::Match(const StringPiece& text, // If the regexp is anchored explicitly, must not be in middle of text. if (prog_->anchor_start() && startpos != 0) return false; + if (prog_->anchor_end() && endpos != text.size()) + return false; // If the regexp is anchored explicitly, update re_anchor // so that we can potentially fall into a faster case below. 
@@ -643,50 +689,87 @@ bool RE2::Match(const StringPiece& text, Prog::MatchKind kind = Prog::kFirstMatch; if (options_.longest_match()) kind = Prog::kLongestMatch; - bool skipped_test = false; - bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); - - // BitState allocates a bitmap of size prog_->list_count() * text.size(). - // It also allocates a stack of 3-word structures which could potentially - // grow as large as prog_->list_count() * text.size(), but in practice is - // much smaller. - const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits) + bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; bool can_bit_state = prog_->CanBitState(); - size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count(); + size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); +#ifdef RE2_HAVE_THREAD_LOCAL + hooks::context = this; +#endif bool dfa_failed = false; + bool skipped_test = false; switch (re_anchor) { default: + LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; + return false; + case UNANCHORED: { + if (prog_->anchor_end()) { + // This is a very special case: we don't need the forward DFA because + // we already know where the match must end! Instead, the reverse DFA + // can say whether there is a match and (optionally) where it starts. + Prog* prog = ReverseProg(); + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } + if (!prog->SearchDFA(subtext, text, Prog::kAnchored, + Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { + if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); + // Fall back to NFA below. + skipped_test = true; + break; + } + return false; + } + if (matchp == NULL) // Matched. Don't care where. 
+ return true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, matchp, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; } return false; } - if (matchp == NULL) // Matched. Don't care where + if (matchp == NULL) // Matched. Don't care where. return true; - // SearchDFA set match[0].end() but didn't know where the - // match started. Run the regexp backward from match[0].end() + // SearchDFA set match.end() but didn't know where the + // match started. Run the regexp backward from match.end() // to find the longest possible match -- that's where it started. Prog* prog = ReverseProg(); - if (prog == NULL) - return false; + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } if (!prog->SearchDFA(match, text, Prog::kAnchored, Prog::kLongestMatch, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", " - << "bytemap range " << prog->bytemap_range() << ", " - << "list count " << prog->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; @@ -712,11 +795,12 @@ bool RE2::Match(const StringPiece& text, // it doesn't have the shared state and occasional mutex that // the DFA does. 
if (can_one_pass && text.size() <= 4096 && - (ncap > 1 || text.size() <= 8)) { + (ncap > 1 || text.size() <= 16)) { skipped_test = true; break; } - if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { + if (can_bit_state && text.size() <= bit_state_text_max_size && + ncap > 1) { skipped_test = true; break; } @@ -724,9 +808,11 @@ bool RE2::Match(const StringPiece& text, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; @@ -761,7 +847,7 @@ bool RE2::Match(const StringPiece& text, LOG(ERROR) << "SearchOnePass inconsistency"; return false; } - } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { + } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { if (!prog_->SearchBitState(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) @@ -829,7 +915,7 @@ bool RE2::DoMatch(const StringPiece& text, } if (consumed != NULL) - *consumed = static_cast(vec[0].end() - text.begin()); + *consumed = static_cast(EndPtr(vec[0]) - BeginPtr(text)); if (n == 0 || args == NULL) { // We are not interested in results @@ -928,8 +1014,8 @@ bool RE2::Rewrite(std::string* out, int n = (c - '0'); if (n >= veclen) { if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); + LOG(ERROR) << "invalid substitution \\" << n + << " from " << veclen << " groups"; } return false; } @@ -949,41 +1035,49 @@ bool RE2::Rewrite(std::string* out, /***** Parsers for various types *****/ -bool 
RE2::Arg::parse_null(const char* str, size_t n, void* dest) { +namespace re2_internal { + +template <> +bool Parse(const char* str, size_t n, void* dest) { // We fail if somebody asked us to store into a non-NULL void* pointer return (dest == NULL); } -bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, std::string* dest) { if (dest == NULL) return true; - reinterpret_cast(dest)->assign(str, n); + dest->assign(str, n); return true; } -bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, StringPiece* dest) { if (dest == NULL) return true; - *(reinterpret_cast(dest)) = StringPiece(str, n); + *dest = StringPiece(str, n); return true; } -bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } -bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, signed char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } -bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, unsigned char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } @@ -1047,10 +1141,40 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, return buf; } -bool RE2::Arg::parse_long_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, float* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, 
true); + char* end; + errno = 0; + float r = strtof(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, double* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); + char* end; + errno = 0; + double r = strtod(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1060,14 +1184,12 @@ bool RE2::Arg::parse_long_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_ulong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1083,62 +1205,52 @@ bool RE2::Arg::parse_ulong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_short_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, short* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; 
// Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (short)r; + *dest = (short)r; return true; } -bool RE2::Arg::parse_ushort_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned short)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (unsigned short)r; + *dest = (unsigned short)r; return true; } -bool RE2::Arg::parse_int_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, int* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (int)r; + *dest = (int)r; return true; } -bool RE2::Arg::parse_uint_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (unsigned int)r; + *dest = (unsigned int)r; return true; } -bool RE2::Arg::parse_longlong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, long long* dest, int radix) { if (n == 0) return false; 
char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1148,14 +1260,12 @@ bool RE2::Arg::parse_longlong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_ulonglong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1170,67 +1280,47 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -static bool parse_double_float(const char* str, size_t n, bool isfloat, - void* dest) { - if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength+1]; - str = TerminateNumber(buf, sizeof buf, str, &n, true); - char* end; - errno = 0; - double r; - if (isfloat) { - r = strtof(str, &end); - } else { - r = strtod(str, &end); - } - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - if (isfloat) { - *(reinterpret_cast(dest)) = (float)r; - } else { - *(reinterpret_cast(dest)) = r; - } - return true; -} +} // namespace re2_internal -bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, false, dest); -} +namespace hooks { -bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, true, dest); -} +#ifdef RE2_HAVE_THREAD_LOCAL +thread_local const RE2* context = NULL; +#endif -#define DEFINE_INTEGER_PARSER(name) \ - bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 
10); \ - } \ - bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \ - void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ - } +template +union Hook { + void Store(T* cb) { cb_.store(cb, std::memory_order_release); } + T* Load() const { return cb_.load(std::memory_order_acquire); } + +#if !defined(__clang__) && defined(_MSC_VER) + // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, + // this is a gross hack to make std::atomic constant-initialized on MSVC. + static_assert(ATOMIC_POINTER_LOCK_FREE == 2, + "std::atomic must be always lock-free"); + T* cb_for_constinit_; +#endif + + std::atomic cb_; +}; + +template +static void DoNothing(const T&) {} + +#define DEFINE_HOOK(type, name) \ + static Hook name##_hook = {{&DoNothing}}; \ + void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ + type##Callback* Get##type##Hook() { return name##_hook.Load(); } + +DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) +DEFINE_HOOK(DFASearchFailure, dfa_search_failure) -DEFINE_INTEGER_PARSER(short); -DEFINE_INTEGER_PARSER(ushort); -DEFINE_INTEGER_PARSER(int); -DEFINE_INTEGER_PARSER(uint); -DEFINE_INTEGER_PARSER(long); -DEFINE_INTEGER_PARSER(ulong); -DEFINE_INTEGER_PARSER(longlong); -DEFINE_INTEGER_PARSER(ulonglong); +#undef DEFINE_HOOK -#undef DEFINE_INTEGER_PARSER +} // namespace hooks } // namespace re2 diff --git a/re2/re2.h b/re2/re2.h index c39589d6d683d70d974712d89c70f90861608760..7fd2245cb35c070b81fb50429bf2af2fa48f4ac0 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -30,6 +30,19 @@ // "(?i)hello" -- (?i) turns on case-insensitive matching // "/\\*(.*?)\\*/" -- .*? matches . minimum no. 
of times possible // +// The double backslashes are needed when writing C++ string literals. +// However, they should NOT be used when writing C++11 raw string literals: +// +// R"(hello (\w+) world)" -- \w matches a "word" character +// R"(version (\d+))" -- \d matches a digit +// R"(hello\s+world)" -- \s matches any whitespace character +// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary +// R"((?i)hello)" -- (?i) turns on case-insensitive matching +// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible +// +// When using UTF-8 encoding, case-insensitive matching will perform +// simple case folding, not full case folding. +// // ----------------------------------------------------------------------- // MATCHING INTERFACE: // @@ -195,6 +208,12 @@ #include #include #include +#include +#include + +#if defined(__APPLE__) +#include +#endif #include "re2/stringpiece.h" @@ -229,6 +248,7 @@ class RE2 { ErrorBadCharRange, // bad character class range ErrorMissingBracket, // missing closing ] ErrorMissingParen, // missing closing ) + ErrorUnexpectedParen, // unexpected closing ) ErrorTrailingBackslash, // trailing \ at end of regexp ErrorRepeatArgument, // repeat argument missing, e.g. "*" ErrorRepeatSize, // bad repetition argument @@ -287,11 +307,11 @@ class RE2 { int ProgramSize() const; int ReverseProgramSize() const; - // EXPERIMENTAL! SUBJECT TO CHANGE! - // Outputs the program fanout as a histogram bucketed by powers of 2. + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::map* histogram) const; - int ReverseProgramFanout(std::map* histogram) const; + int ProgramFanout(std::vector* histogram) const; + int ReverseProgramFanout(std::vector* histogram) const; // Returns the underlying Regexp; not for general use. 
// Returns entire_regexp_ so that callers don't need @@ -349,12 +369,12 @@ class RE2 { // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" exactly - // b. The number of matched sub-patterns is >= number of supplied pointers + // a. "text" matches "re" fully - from the beginning to the end of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. // c. The "i"th argument has a suitable type for holding the // string captured as the "i"th sub-pattern. If you pass in // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, "i"th captured sub-pattern is + // number of sub-patterns, the "i"th captured sub-pattern is // ignored. // // CAVEAT: An optional sub-pattern that does not exist in the @@ -368,8 +388,17 @@ class RE2 { return Apply(FullMatchN, text, re, Arg(std::forward(a))...); } - // Exactly like FullMatch(), except that "re" is allowed to match - // a substring of "text". + // Like FullMatch(), except that "re" is allowed to match a substring + // of "text". + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" partially - for some substring of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); @@ -378,7 +407,16 @@ class RE2 { // Like FullMatch() and PartialMatch(), except that "re" has to match // a prefix of the text, and "input" is advanced past the matched // text. 
Note: "input" is modified iff this routine returns true - // and "re" matched a non-empty substring of "text". + // and "re" matched a non-empty substring of "input". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some prefix of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool Consume(StringPiece* input, const RE2& re, A&&... a) { return Apply(ConsumeN, input, re, Arg(std::forward(a))...); @@ -388,6 +426,15 @@ class RE2 { // the text. That is, "re" need not start its match at the beginning // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds // the next word in "s" and stores it in "word". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some substring of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); @@ -443,7 +490,7 @@ class RE2 { // Escapes all potentially meaningful regexp characters in // 'unquoted'. The returned string, used as a regular expression, - // will exactly match the original string. For example, + // will match exactly the original string. For example, // 1.5-2.0? // may become: // 1\.5\-2\.0\? 
@@ -626,17 +673,6 @@ class RE2 { Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } - // Legacy interface to encoding. - // TODO(rsc): Remove once clients have been converted. - bool utf8() const { return encoding_ == EncodingUTF8; } - void set_utf8(bool b) { - if (b) { - encoding_ = EncodingUTF8; - } else { - encoding_ = EncodingLatin1; - } - } - bool posix_syntax() const { return posix_syntax_; } void set_posix_syntax(bool b) { posix_syntax_ = b; } @@ -699,32 +735,12 @@ class RE2 { const Options& options() const { return options_; } // Argument converters; see below. - static inline Arg CRadix(short* x); - static inline Arg CRadix(unsigned short* x); - static inline Arg CRadix(int* x); - static inline Arg CRadix(unsigned int* x); - static inline Arg CRadix(long* x); - static inline Arg CRadix(unsigned long* x); - static inline Arg CRadix(long long* x); - static inline Arg CRadix(unsigned long long* x); - - static inline Arg Hex(short* x); - static inline Arg Hex(unsigned short* x); - static inline Arg Hex(int* x); - static inline Arg Hex(unsigned int* x); - static inline Arg Hex(long* x); - static inline Arg Hex(unsigned long* x); - static inline Arg Hex(long long* x); - static inline Arg Hex(unsigned long long* x); - - static inline Arg Octal(short* x); - static inline Arg Octal(unsigned short* x); - static inline Arg Octal(int* x); - static inline Arg Octal(unsigned int* x); - static inline Arg Octal(long* x); - static inline Arg Octal(unsigned long* x); - static inline Arg Octal(long long* x); - static inline Arg Octal(unsigned long long* x); + template + static Arg CRadix(T* ptr); + template + static Arg Hex(T* ptr); + template + static Arg Octal(T* ptr); private: void Init(const StringPiece& pattern, const Options& options); @@ -737,29 +753,26 @@ class RE2 { re2::Prog* ReverseProg() const; - std::string pattern_; // string regular expression - Options options_; // option flags - std::string prefix_; 
// required prefix (before regexp_) - bool prefix_foldcase_; // prefix is ASCII case-insensitive - re2::Regexp* entire_regexp_; // parsed regular expression - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // Number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? - - mutable re2::Prog* rprog_; // reverse program for regexp - mutable const std::string* error_; // Error indicator - // (or points to empty string) - mutable ErrorCode error_code_; // Error code - mutable std::string error_arg_; // Fragment of regexp showing error - + std::string pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + const std::string* error_; // error indicator (or points to empty string) + ErrorCode error_code_; // error code + std::string error_arg_; // fragment of regexp showing error + std::string prefix_; // required prefix (before suffix_regexp_) + bool prefix_foldcase_; // prefix_ is ASCII case-insensitive + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + re2::Prog* prog_; // compiled program for regexp + int num_captures_; // number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + + // Reverse Prog for DFA execution only + mutable re2::Prog* rprog_; // Map from capture names to indices mutable const std::map* named_groups_; - // Map from capture indices to names mutable const std::map* group_names_; - // Onces for lazy computations. mutable std::once_flag rprog_once_; mutable std::once_flag named_groups_once_; mutable std::once_flag group_names_once_; @@ -770,137 +783,134 @@ class RE2 { /***** Implementation details *****/ -// Hex/Octal/Binary? 
- -// Special class for parsing into objects that define a ParseFrom() method -template -class _RE2_MatchObject { - public: - static inline bool Parse(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - T* object = reinterpret_cast(dest); - return object->ParseFrom(str, n); - } -}; +namespace re2_internal { + +// Types for which the 3-ary Parse() function template has specializations. +template struct Parse3ary : public std::false_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest); + +// Types for which the 4-ary Parse() function template has specializations. 
+template struct Parse4ary : public std::false_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest, int radix); + +} // namespace re2_internal class RE2::Arg { + private: + template + using CanParse3ary = typename std::enable_if< + re2_internal::Parse3ary::value, + int>::type; + + template + using CanParse4ary = typename std::enable_if< + re2_internal::Parse4ary::value, + int>::type; + +#if !defined(_MSC_VER) + template + using CanParseFrom = typename std::enable_if< + std::is_member_function_pointer< + decltype(static_cast( + &T::ParseFrom))>::value, + int>::type; +#endif + public: - // Empty constructor so we can declare arrays of RE2::Arg - Arg(); + Arg() : Arg(nullptr) {} + Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} - // Constructor specially designed for NULL arguments - Arg(void*); - Arg(std::nullptr_t); + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary) {} + + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary) {} + +#if !defined(_MSC_VER) + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom) {} +#endif typedef bool (*Parser)(const char* str, size_t n, void* dest); -// Type-specific parsers -#define MAKE_PARSER(type, name) \ - Arg(type* p) : arg_(p), parser_(name) {} \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} - - MAKE_PARSER(char, parse_char) - MAKE_PARSER(signed char, parse_schar) - MAKE_PARSER(unsigned char, parse_uchar) - MAKE_PARSER(float, parse_float) - MAKE_PARSER(double, parse_double) - MAKE_PARSER(std::string, 
parse_string) - MAKE_PARSER(StringPiece, parse_stringpiece) - - MAKE_PARSER(short, parse_short) - MAKE_PARSER(unsigned short, parse_ushort) - MAKE_PARSER(int, parse_int) - MAKE_PARSER(unsigned int, parse_uint) - MAKE_PARSER(long, parse_long) - MAKE_PARSER(unsigned long, parse_ulong) - MAKE_PARSER(long long, parse_longlong) - MAKE_PARSER(unsigned long long, parse_ulonglong) - -#undef MAKE_PARSER - - // Generic constructor templates - template Arg(T* p) - : arg_(p), parser_(_RE2_MatchObject::Parse) { } - template Arg(T* p, Parser parser) - : arg_(p), parser_(parser) { } - - // Parse the data - bool Parse(const char* str, size_t n) const; + template + Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} + + bool Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); + } private: - void* arg_; - Parser parser_; + static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) { + return true; + } - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); - -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_##name(const char* str, size_t n, void* dest); \ - static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ - int radix); \ - \ - public: \ - static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ - static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ - static bool parse_##name##_cradix(const char* str, size_t n, void* dest); - - DECLARE_INTEGER_PARSER(short) - 
DECLARE_INTEGER_PARSER(ushort) - DECLARE_INTEGER_PARSER(int) - DECLARE_INTEGER_PARSER(uint) - DECLARE_INTEGER_PARSER(long) - DECLARE_INTEGER_PARSER(ulong) - DECLARE_INTEGER_PARSER(longlong) - DECLARE_INTEGER_PARSER(ulonglong) - -#undef DECLARE_INTEGER_PARSER + template + static bool DoParse3ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest)); + } -}; + template + static bool DoParse4ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 10); + } -inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } -inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } -inline RE2::Arg::Arg(std::nullptr_t p) : arg_(p), parser_(parse_null) { } +#if !defined(_MSC_VER) + template + static bool DoParseFrom(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + return reinterpret_cast(dest)->ParseFrom(str, n); + } +#endif -inline bool RE2::Arg::Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); -} + void* arg_; + Parser parser_; +}; -// This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline RE2::Arg RE2::Hex(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \ - } \ - inline RE2::Arg RE2::Octal(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \ - } \ - inline RE2::Arg RE2::CRadix(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \ - } +template +inline RE2::Arg RE2::CRadix(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 0); + }); +} -MAKE_INTEGER_PARSER(short, short) -MAKE_INTEGER_PARSER(unsigned short, ushort) -MAKE_INTEGER_PARSER(int, int) -MAKE_INTEGER_PARSER(unsigned int, uint) -MAKE_INTEGER_PARSER(long, long) -MAKE_INTEGER_PARSER(unsigned long, ulong) -MAKE_INTEGER_PARSER(long long, longlong) 
-MAKE_INTEGER_PARSER(unsigned long long, ulonglong) +template +inline RE2::Arg RE2::Hex(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 16); + }); +} -#undef MAKE_INTEGER_PARSER +template +inline RE2::Arg RE2::Octal(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 8); + }); +} #ifndef SWIG - // Silence warnings about missing initializers for members of LazyRE2. -// Note that we test for Clang first because it defines __GNUC__ as well. -#if defined(__clang__) -#elif defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -949,7 +959,55 @@ class LazyRE2 { void operator=(const LazyRE2&); // disallowed }; -#endif // SWIG +#endif + +namespace hooks { + +// Most platforms support thread_local. Older versions of iOS don't support +// thread_local, but for the sake of brevity, we lump together all versions +// of Apple platforms that aren't macOS. If an iOS application really needs +// the context pointee someday, we can get more specific then... +// +// As per https://github.com/google/re2/issues/325, thread_local support in +// MinGW seems to be buggy. (FWIW, Abseil folks also avoid it.) +#define RE2_HAVE_THREAD_LOCAL +#if (defined(__APPLE__) && !TARGET_OS_OSX) || defined(__MINGW32__) +#undef RE2_HAVE_THREAD_LOCAL +#endif + +// A hook must not make any assumptions regarding the lifetime of the context +// pointee beyond the current invocation of the hook. Pointers and references +// obtained via the context pointee should be considered invalidated when the +// hook returns. Hence, any data about the context pointee (e.g. its pattern) +// would have to be copied in order for it to be kept for an indefinite time. +// +// A hook must not use RE2 for matching. 
Control flow reentering RE2::Match() +// could result in infinite mutual recursion. To discourage that possibility, +// RE2 will not maintain the context pointer correctly when used in that way. +#ifdef RE2_HAVE_THREAD_LOCAL +extern thread_local const RE2* context; +#endif + +struct DFAStateCacheReset { + int64_t state_budget; + size_t state_cache_size; +}; + +struct DFASearchFailure { + // Nothing yet... +}; + +#define DECLARE_HOOK(type) \ + using type##Callback = void(const type&); \ + void Set##type##Hook(type##Callback* cb); \ + type##Callback* Get##type##Hook(); + +DECLARE_HOOK(DFAStateCacheReset) +DECLARE_HOOK(DFASearchFailure) + +#undef DECLARE_HOOK + +} // namespace hooks } // namespace re2 diff --git a/re2/regexp.cc b/re2/regexp.cc index 7995ffceb320e9d84d8c04efd0daeef7d60267db..2e1bfac910e0602ecbde72ad650bfac0da9aba71 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -20,6 +20,7 @@ #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, return new Regexp(kRegexpEmptyMatch, flags); } - Regexp** subcopy = NULL; + PODArray subcopy; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. 
- subcopy = new Regexp*[nsub]; - memmove(subcopy, sub, nsub * sizeof sub[0]); - sub = subcopy; + subcopy = PODArray(nsub); + memmove(subcopy.data(), sub, nsub * sizeof sub[0]); + sub = subcopy.data(); nsub = FactorAlternation(sub, nsub, flags); if (nsub == 1) { Regexp* re = sub[0]; - delete[] subcopy; return re; } } @@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags, false); - delete[] subcopy; return re; } @@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, Regexp** subs = re->sub(); for (int i = 0; i < nsub; i++) subs[i] = sub[i]; - - delete[] subcopy; return re; } @@ -501,6 +498,7 @@ static const char *kErrorStrings[] = { "invalid character class range", "missing ]", "missing )", + "unexpected )", "trailing \\", "no argument for repetition operator", "invalid repetition size", @@ -544,9 +542,12 @@ class NumCapturesWalker : public Regexp::Walker { ncapture_++; return ignored; } + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; +#endif return ignored; } @@ -575,7 +576,7 @@ class NamedCapturesWalker : public Regexp::Walker { return m; } - Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) @@ -591,8 +592,10 @@ class NamedCapturesWalker : public Regexp::Walker { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; +#endif return ignored; } @@ -621,7 +624,7 @@ class CaptureNamesWalker : public Regexp::Walker { return m; } - Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) @@ -633,8 +636,10 @@ class CaptureNamesWalker : public Regexp::Walker { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; +#endif return ignored; } @@ -651,78 +656,89 @@ std::map* Regexp::CaptureNames() { return w.TakeMap(); } +void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, + std::string* bytes) { + if (latin1) { + bytes->resize(nrunes); + for (int i = 0; i < nrunes; i++) + (*bytes)[i] = static_cast(runes[i]); + } else { + bytes->resize(nrunes * UTFmax); // worst case + char* p = &(*bytes)[0]; + for (int i = 0; i < nrunes; i++) + p += runetochar(p, &runes[i]); + bytes->resize(p - &(*bytes)[0]); + bytes->shrink_to_fit(); + } +} + // Determines whether regexp matches must be anchored // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix) { + prefix->clear(); + *foldcase = false; + *suffix = NULL; + // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string // 3. 
the rest - prefix->clear(); - *foldcase = false; - *suffix = NULL; if (op_ != kRegexpConcat) return false; - - // Some number of anchors, then a literal or concatenation. int i = 0; - Regexp** sub = this->sub(); - while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) + while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) i++; if (i == 0 || i >= nsub_) return false; - - Regexp* re = sub[i]; - switch (re->op_) { - default: - return false; - - case kRegexpLiteralString: - // Convert to string in proper encoding. - if (re->parse_flags() & Latin1) { - prefix->resize(re->nrunes_); - for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = static_cast(re->runes_[j]); - } else { - // Convert to UTF-8 in place. - // Assume worst-case space and then trim. - prefix->resize(re->nrunes_ * UTFmax); - char *p = &(*prefix)[0]; - for (int j = 0; j < re->nrunes_; j++) { - Rune r = re->runes_[j]; - if (r < Runeself) - *p++ = static_cast(r); - else - p += runetochar(p, &r); - } - prefix->resize(p - &(*prefix)[0]); - } - break; - - case kRegexpLiteral: - if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, static_cast(re->rune_)); - } else { - char buf[UTFmax]; - prefix->append(buf, runetochar(buf, &re->rune_)); - } - break; - } - *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; + Regexp* re = sub()[i]; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; i++; - - // The rest. if (i < nsub_) { for (int j = i; j < nsub_; j++) - sub[j]->Incref(); - re = Concat(sub + i, nsub_ - i, parse_flags()); + sub()[j]->Incref(); + *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); } else { - re = new Regexp(kRegexpEmptyMatch, parse_flags()); + *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); } - *suffix = re; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 
1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; + return true; +} + +// Determines whether regexp matches must be unanchored +// with a fixed string prefix. If so, returns the prefix. +// The prefix might be ASCII case-insensitive. +bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { + prefix->clear(); + *foldcase = false; + + // No need for a walker: the regexp must either begin with or be + // a literal char or string. We "see through" capturing groups, + // but make no effort to glue multiple prefix fragments together. + Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; + while (re->op_ == kRegexpCapture) { + re = re->sub()[0]; + if (re->op_ == kRegexpConcat && re->nsub_ > 0) + re = re->sub()[0]; + } + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; return true; } @@ -903,7 +919,7 @@ void CharClassBuilder::Negate() { // The ranges are allocated in the same block as the header, // necessitating a special allocator and Delete method. 
-CharClass* CharClass::New(int maxranges) { +CharClass* CharClass::New(size_t maxranges) { CharClass* cc; uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; cc = reinterpret_cast(data); @@ -920,7 +936,7 @@ void CharClass::Delete() { } CharClass* CharClass::Negate() { - CharClass* cc = CharClass::New(nranges_+1); + CharClass* cc = CharClass::New(static_cast(nranges_+1)); cc->folds_ascii_ = folds_ascii_; cc->nrunes_ = Runemax + 1 - nrunes_; int n = 0; @@ -939,7 +955,7 @@ CharClass* CharClass::Negate() { return cc; } -bool CharClass::Contains(Rune r) { +bool CharClass::Contains(Rune r) const { RuneRange* rr = ranges_; int n = nranges_; while (n > 0) { @@ -957,7 +973,7 @@ bool CharClass::Contains(Rune r) { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(static_cast(ranges_.size())); + CharClass* cc = CharClass::New(ranges_.size()); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; diff --git a/re2/regexp.h b/re2/regexp.h index a5d85c812885114a523f482dd75967c7330fb428..b6446f9fe5d40a5791243482a231a4a09ad01f18 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -86,6 +86,7 @@ // form accessible to clients, so that client code can analyze the // parsed regular expressions. +#include #include #include #include @@ -177,6 +178,7 @@ enum RegexpStatusCode { kRegexpBadCharRange, // bad character class range kRegexpMissingBracket, // missing closing ] kRegexpMissingParen, // missing closing ) + kRegexpUnexpectedParen, // unexpected closing ) kRegexpTrailingBackslash, // at end of regexp kRegexpRepeatArgument, // repeat argument missing, e.g. 
"*" kRegexpRepeatSize, // bad repetition argument @@ -252,13 +254,13 @@ class CharClass { bool full() { return nrunes_ == Runemax+1; } bool FoldsASCII() { return folds_ascii_; } - bool Contains(Rune r); + bool Contains(Rune r) const; CharClass* Negate(); private: CharClass(); // not implemented ~CharClass(); // not implemented - static CharClass* New(int maxranges); + static CharClass* New(size_t maxranges); friend class CharClassBuilder; @@ -440,6 +442,17 @@ class Regexp { bool RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix); + // Whether every match of this regexp must be unanchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix. + // Callers should expect *prefix and *foldcase to be "zeroed" + // regardless of the return value. + bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + + // Controls the maximum repeat count permitted by the parser. + // FOR FUZZING ONLY. + static void FUZZING_ONLY_set_maximum_repeat_count(int i); + private: // Constructor allocates vectors as appropriate for operator. 
explicit Regexp(RegexpOp op, ParseFlags parse_flags); diff --git a/re2/set.cc b/re2/set.cc index 69af666077da89573606b9148c1d7e40c1775574..18705663a52b6e3e243ec8e45c23b7df1d969f8a 100644 --- a/re2/set.cc +++ b/re2/set.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include "util/util.h" #include "util/logging.h" @@ -18,19 +19,37 @@ namespace re2 { -RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { - options_.Copy(options); +RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) + : options_(options), + anchor_(anchor), + compiled_(false), + size_(0) { options_.set_never_capture(true); // might unblock some optimisations - anchor_ = anchor; - prog_ = NULL; - compiled_ = false; - size_ = 0; } RE2::Set::~Set() { for (size_t i = 0; i < elem_.size(); i++) elem_[i].second->Decref(); - delete prog_; +} + +RE2::Set::Set(Set&& other) + : options_(other.options_), + anchor_(other.anchor_), + elem_(std::move(other.elem_)), + compiled_(other.compiled_), + size_(other.size_), + prog_(std::move(other.prog_)) { + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); +} + +RE2::Set& RE2::Set::operator=(Set&& other) { + this->~Set(); + (void) new (this) Set(std::move(other)); + return *this; } int RE2::Set::Add(const StringPiece& pattern, std::string* error) { @@ -97,9 +116,9 @@ bool RE2::Set::Compile() { options_.ParseFlags()); re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); - prog_ = Prog::CompileSet(re, anchor_, options_.max_mem()); + prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); re->Decref(); - return prog_ != NULL; + return prog_ != nullptr; } bool RE2::Set::Match(const StringPiece& text, std::vector* v) const { @@ -114,6 +133,9 @@ bool RE2::Set::Match(const StringPiece& text, std::vector* v, error_info->kind = kNotCompiled; return false; } +#ifdef RE2_HAVE_THREAD_LOCAL + hooks::context = NULL; +#endif bool dfa_failed = false; std::unique_ptr 
matches; if (v != NULL) { @@ -124,9 +146,10 @@ bool RE2::Set::Match(const StringPiece& text, std::vector* v, NULL, &dfa_failed, matches.get()); if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); if (error_info != NULL) error_info->kind = kOutOfMemory; return false; diff --git a/re2/set.h b/re2/set.h index 59733fd94cda3b6eb8ab99068375be56d5cc9182..8d64f30ccd94073058de740e22fb110d013de506 100644 --- a/re2/set.h +++ b/re2/set.h @@ -5,6 +5,7 @@ #ifndef RE2_SET_H_ #define RE2_SET_H_ +#include #include #include #include @@ -36,6 +37,13 @@ class RE2::Set { Set(const RE2::Options& options, RE2::Anchor anchor); ~Set(); + // Not copyable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + // Movable. + Set(Set&& other); + Set& operator=(Set&& other); + // Adds pattern to the set using the options passed to the constructor. // Returns the index that will identify the regexp in the output of Match(), // or -1 if the regexp cannot be parsed. @@ -67,12 +75,9 @@ class RE2::Set { RE2::Options options_; RE2::Anchor anchor_; std::vector elem_; - re2::Prog* prog_; bool compiled_; int size_; - - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; + std::unique_ptr prog_; }; } // namespace re2 diff --git a/re2/simplify.cc b/re2/simplify.cc index c6eb4a77437b0ac4b540af94d2e67edb8c49ee4c..663d5fcd4569b3b3976bc5d7609c92e1563e5d9a 100644 --- a/re2/simplify.cc +++ b/re2/simplify.cc @@ -28,8 +28,6 @@ bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, Regexp* sre = re->Simplify(); re->Decref(); if (sre == NULL) { - // Should not happen, since Simplify never fails. 
- LOG(ERROR) << "Simplify failed on " << src; if (status) { status->set_code(kRegexpInternalError); status->set_error_arg(src); @@ -180,10 +178,20 @@ Regexp* Regexp::Simplify() { CoalesceWalker cw; Regexp* cre = cw.Walk(this, NULL); if (cre == NULL) - return cre; + return NULL; + if (cw.stopped_early()) { + cre->Decref(); + return NULL; + } SimplifyWalker sw; Regexp* sre = sw.Walk(cre, NULL); cre->Decref(); + if (sre == NULL) + return NULL; + if (sw.stopped_early()) { + sre->Decref(); + return NULL; + } return sre; } @@ -212,9 +220,10 @@ Regexp* CoalesceWalker::Copy(Regexp* re) { } Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; +#endif return re->Incref(); } @@ -437,9 +446,10 @@ Regexp* SimplifyWalker::Copy(Regexp* re) { } Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; +#endif return re->Incref(); } diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 1e888da9bba3c9d87fc9528dbf3e96c2370abc1c..920a4534dcc0b71ead4d6a1d9cc8f44c434e22f9 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -29,6 +29,7 @@ #include "util/util.h" #include "util/logging.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -53,7 +54,6 @@ namespace re2 { class Backtracker { public: explicit Backtracker(Prog* prog); - ~Backtracker(); bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool longest, @@ -79,9 +79,11 @@ class Backtracker { int nsubmatch_; // # of submatches to fill in // Search state - const char* cap_[64]; // capture registers - uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked - size_t nvisited_; // # of words in bitmap + const char* cap_[64]; // capture registers + PODArray visited_; // bitmap: (Inst*, char*) pairs visited + + Backtracker(const Backtracker&) = delete; + Backtracker& operator=(const Backtracker&) = delete; }; Backtracker::Backtracker(Prog* prog) @@ -90,13 +92,7 @@ Backtracker::Backtracker(Prog* prog) longest_(false), endmatch_(false), submatch_(NULL), - nsubmatch_(0), - visited_(NULL), - nvisited_(0) { -} - -Backtracker::~Backtracker() { - delete[] visited_; + nsubmatch_(0) { } // Runs a backtracking search. 
@@ -107,9 +103,9 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, context_ = context; if (context_.data() == NULL) context_ = text; - if (prog_->anchor_start() && text.begin() > context_.begin()) + if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_)) return false; - if (prog_->anchor_end() && text.end() < context_.end()) + if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_)) return false; anchored_ = anchored | prog_->anchor_start(); longest_ = longest | prog_->anchor_end(); @@ -130,10 +126,10 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, // Allocate new visited_ bitmap -- size is proportional // to text, so have to reallocate on each call to Search. - delete[] visited_; - nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; - visited_ = new uint32_t[nvisited_]; - memset(visited_, 0, nvisited_*sizeof visited_[0]); + int nvisited = prog_->size() * static_cast(text.size()+1); + nvisited = (nvisited + 31) / 32; + visited_ = PODArray(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); // Anchored search must start at text.begin(). if (anchored_) { @@ -163,8 +159,9 @@ bool Backtracker::Visit(int id, const char* p) { // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. 
CHECK(p <= text_.data() + text_.size()); - size_t n = id*(text_.size()+1) + (p - text_.data()); - CHECK_LT(n/32, nvisited_); + int n = id * static_cast(text_.size()+1) + + static_cast(p-text_.data()); + CHECK_LT(n/32, visited_.size()); if (visited_[n/32] & (1 << (n&31))) return false; visited_[n/32] |= 1 << (n&31); @@ -270,7 +267,7 @@ bool Prog::UnsafeSearchBacktrack(const StringPiece& text, bool longest = kind != kFirstMatch; if (!b.Search(text, context, anchored, longest, match, nmatch)) return false; - if (kind == kFullMatch && match[0].end() != text.end()) + if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) return false; return true; } diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc index a2837a69a3c526f92003f6c67b7eb06dec3c0c59..9c2a32f6a8494931864fa042f284c47b492ca822 100644 --- a/re2/testing/charclass_test.cc +++ b/re2/testing/charclass_test.cc @@ -85,7 +85,7 @@ static CCTest tests[] = { { {-1} } }, }; -template +template static void Broke(const char *desc, const CCTest* t, CharClass* cc) { if (t == NULL) { printf("\t%s:", desc); @@ -136,7 +136,7 @@ void Delete(CharClassBuilder* cc) { delete cc; } -template +template bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { typename CharClass::iterator it = cc->begin(); int size = 0; diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 6b77cf97b91f27c5f69a80b24e1f1aec2509aebb..47188307815126e6b6d6185d121578e7241e9ca3 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -109,6 +109,20 @@ static Test tests[] = { { "[[-`]", "3. byte [5b-60] 0 -> 4\n" "4. match! 0\n" }, + // Issue 310 + { "(?:|a)*", + "3+ nop -> 7\n" + "4. nop -> 9\n" + "5+ nop -> 7\n" + "6. nop -> 9\n" + "7+ nop -> 5\n" + "8. byte [61-61] 0 -> 5\n" + "9. match! 0\n" }, + { "(?:|a)+", + "3+ nop -> 5\n" + "4. byte [61-61] 0 -> 5\n" + "5+ nop -> 3\n" + "6. match! 
0\n" }, }; TEST(TestRegexpCompileToProg, Simple) { @@ -147,10 +161,19 @@ static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - EXPECT_TRUE(prog != NULL); - *bytemap = prog->DumpByteMap(); - delete prog; + { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *bytemap = prog->DumpByteMap(); + delete prog; + } + + { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + EXPECT_EQ(*bytemap, prog->DumpByteMap()); + delete prog; + } re->Decref(); } @@ -213,16 +236,11 @@ TEST(TestCompile, UTF8Ranges) { EXPECT_EQ("[00-09] -> 0\n" "[0a-0a] -> 1\n" "[0b-7f] -> 0\n" - "[80-8f] -> 2\n" - "[90-9f] -> 3\n" - "[a0-bf] -> 4\n" + "[80-bf] -> 2\n" "[c0-c1] -> 1\n" - "[c2-df] -> 5\n" - "[e0-e0] -> 6\n" - "[e1-ef] -> 7\n" - "[f0-f0] -> 8\n" - "[f1-f3] -> 9\n" - "[f4-f4] -> 10\n" + "[c2-df] -> 3\n" + "[e0-ef] -> 4\n" + "[f0-f4] -> 5\n" "[f5-ff] -> 1\n", bytemap); } @@ -232,7 +250,7 @@ TEST(TestCompile, InsufficientMemory) { "^(?P[^\\s]+)\\s+(?P[^\\s]+)\\s+(?P.+)$", Regexp::LikePerl, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(920); + Prog* prog = re->CompileToProg(850); // If the memory budget has been exhausted, compilation should fail // and return NULL instead of trying to do anything with NoMatch(). EXPECT_TRUE(prog == NULL); @@ -299,20 +317,22 @@ TEST(TestCompile, Bug26705922) { "8. byte [f0-f0] 0 -> 7\n", reverse); - Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse); - EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" - "4+ byte [c2-df] 0 -> 7\n" - "5+ byte [a0-bf] 1 -> 8\n" - "6. byte [80-bf] 0 -> 9\n" + Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n" + "4+ byte [e0-ef] 0 -> 8\n" + "5. byte [f0-f4] 0 -> 9\n" + "6. byte [80-bf] 0 -> 7\n" "7. match! 0\n" - "8. 
byte [e0-e0] 0 -> 7\n" - "9+ byte [e1-ef] 0 -> 7\n" - "10+ byte [90-bf] 1 -> 13\n" - "11+ byte [80-bf] 1 -> 14\n" - "12. byte [80-8f] 0 -> 15\n" - "13. byte [f0-f0] 0 -> 7\n" - "14. byte [f1-f3] 0 -> 7\n" - "15. byte [f4-f4] 0 -> 7\n", + "8. byte [80-bf] 0 -> 6\n" + "9. byte [80-bf] 0 -> 8\n", + forward); + EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" + "4+ byte [c2-df] 0 -> 6\n" + "5. byte [80-bf] 0 -> 7\n" + "6. match! 0\n" + "7+ byte [e0-ef] 0 -> 6\n" + "8. byte [80-bf] 0 -> 9\n" + "9. byte [f0-f4] 0 -> 6\n", reverse); } @@ -332,27 +352,37 @@ TEST(TestCompile, Bug35237384) { forward); Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); - EXPECT_EQ("3+ nop -> 6\n" - "4+ nop -> 8\n" - "5. nop -> 21\n" - "6+ byte [61-61] 1 -> 6\n" - "7. nop -> 3\n" - "8+ byte [62-62] 1 -> 8\n" - "9. nop -> 3\n" - "10+ byte [61-61] 1 -> 10\n" - "11. nop -> 21\n" - "12+ byte [62-62] 1 -> 12\n" - "13. nop -> 21\n" - "14+ byte [61-61] 1 -> 14\n" - "15. nop -> 18\n" - "16+ byte [62-62] 1 -> 16\n" - "17. nop -> 18\n" - "18+ nop -> 14\n" - "19+ nop -> 16\n" - "20. match! 0\n" - "21+ nop -> 10\n" - "22+ nop -> 12\n" - "23. nop -> 18\n", + EXPECT_EQ("3+ nop -> 28\n" + "4. nop -> 30\n" + "5+ byte [61-61] 1 -> 5\n" + "6. nop -> 32\n" + "7+ byte [61-61] 1 -> 7\n" + "8. nop -> 26\n" + "9+ byte [61-61] 1 -> 9\n" + "10. nop -> 20\n" + "11+ byte [62-62] 1 -> 11\n" + "12. nop -> 20\n" + "13+ byte [62-62] 1 -> 13\n" + "14. nop -> 26\n" + "15+ byte [62-62] 1 -> 15\n" + "16. nop -> 32\n" + "17+ nop -> 9\n" + "18. nop -> 11\n" + "19. match! 0\n" + "20+ nop -> 17\n" + "21. nop -> 19\n" + "22+ nop -> 7\n" + "23. nop -> 13\n" + "24+ nop -> 17\n" + "25. nop -> 19\n" + "26+ nop -> 22\n" + "27. nop -> 24\n" + "28+ nop -> 5\n" + "29. nop -> 15\n" + "30+ nop -> 22\n" + "31. nop -> 24\n" + "32+ nop -> 28\n" + "33. 
nop -> 30\n", forward); Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc index 25f2311947ed555236f126b5637d7d94938ffc75..842daafff740b551306cb6c9e883a3908c5744a7 100644 --- a/re2/testing/dfa_test.cc +++ b/re2/testing/dfa_test.cc @@ -26,6 +26,20 @@ DEFINE_FLAG(int, threads, 4, "number of threads"); namespace re2 { +static int state_cache_resets = 0; +static int search_failures = 0; + +struct SetHooks { + SetHooks() { + hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) { + ++state_cache_resets; + }); + hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) { + ++search_failures; + }); + } +} set_hooks; + // Check that multithreaded access to DFA class works. // Helper function: builds entire DFA for prog. @@ -108,44 +122,6 @@ TEST(SingleThreaded, BuildEntireDFA) { re->Decref(); } -// Generates and returns a string over binary alphabet {0,1} that contains -// all possible binary sequences of length n as subsequences. The obvious -// brute force method would generate a string of length n * 2^n, but this -// generates a string of length n + 2^n - 1 called a De Bruijn cycle. -// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. -// Such a string is useful for testing a DFA. If you have a DFA -// where distinct last n bytes implies distinct states, then running on a -// DeBruijn string causes the DFA to need to create a new state at every -// position in the input, never reusing any states until it gets to the -// end of the string. This is the worst possible case for DFA execution. -static std::string DeBruijnString(int n) { - CHECK_LT(n, static_cast(8*sizeof(int))); - CHECK_GT(n, 0); - - std::vector did(size_t{1}<Decref(); // Reset to original behaviour. 
- Prog::TEST_dfa_should_bail_when_slow(true); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true); + ASSERT_GT(state_cache_resets, 0); + ASSERT_EQ(search_failures, 0); } // Helper function: searches for match, which should match, @@ -238,7 +218,9 @@ static void DoSearch(Prog* prog, const StringPiece& match, } TEST(Multithreaded, SearchDFA) { - Prog::TEST_dfa_should_bail_when_slow(false); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false); + state_cache_resets = 0; + search_failures = 0; // Same as single-threaded test above. const int n = 18; @@ -277,7 +259,9 @@ TEST(Multithreaded, SearchDFA) { re->Decref(); // Reset to original behaviour. - Prog::TEST_dfa_should_bail_when_slow(true); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true); + ASSERT_GT(state_cache_resets, 0); + ASSERT_EQ(search_failures, 0); } struct ReverseTest { diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc index bdac381a896ef3e5b8fac06f8a8b035ecccb4236..b0409c32d7cf80848e802cbe19528b27f44c61aa 100644 --- a/re2/testing/exhaustive_tester.cc +++ b/re2/testing/exhaustive_tester.cc @@ -67,8 +67,8 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc printf("-"); else printf("%td-%td", - m[i].begin() - input.begin(), - m[i].end() - input.begin()); + BeginPtr(m[i]) - BeginPtr(input), + EndPtr(m[i]) - BeginPtr(input)); } } diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc index deef2f87d628a41634288c1010546dc27f2d20c5..c788fdadc49b2f7ae280fef9289f79a5ee172fde 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include "util/test.h" #include "util/logging.h" @@ -291,4 +292,49 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) { "EmptyStringInStringSetBug", &v)); } +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + 
v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + } // namespace re2 diff --git a/re2/testing/null_walker.cc b/re2/testing/null_walker.cc index 77fa72389eadee98b706da989967c4cb8e01e711..2bdea027891587b8daec566dcbc3009d7dae3096 100644 --- a/re2/testing/null_walker.cc +++ b/re2/testing/null_walker.cc @@ -13,13 +13,16 @@ namespace re2 { class NullWalker : public Regexp::Walker { public: - NullWalker() { } - bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); + NullWalker() {} - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. 
+ virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NullWalker::ShortVisit called"; +#endif return a; } diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 344652690fe8ab9bca866b0d98e9dc3246b613f7..e571127b55146038b0a36d28f99ad5ab20ca327e 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -164,6 +164,7 @@ static Test tests[] = { // Test named captures { "(?Pa)", "cap{name:lit{a}}" }, + { "(?P<中文>a)", "cap{中文:lit{a}}" }, // Case-folded literals { "[Aa]", "litfold{a}" }, diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc index 7a38de7c2fb1a6b66d20a23d916e8a0460933142..f62e17cf4772c688a661cc262007c462ae373dd2 100644 --- a/re2/testing/re2_arg_test.cc +++ b/re2/testing/re2_arg_test.cc @@ -11,6 +11,7 @@ #include #include "util/test.h" +#include "util/logging.h" #include "re2/re2.h" namespace re2 { @@ -132,4 +133,28 @@ TEST(RE2ArgTest, Uint64Test) { PARSE_FOR_TYPE(uint64_t, 5); } +TEST(RE2ArgTest, ParseFromTest) { +#if !defined(_MSC_VER) + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return true; + } + } obj1; + RE2::Arg arg1(&obj1); + EXPECT_TRUE(arg1.Parse("one", 3)); + + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return false; + } + // Ensure that RE2::Arg works even with overloaded ParseFrom(). 
+ void ParseFrom(const char* str) {} + } obj2; + RE2::Arg arg2(&obj2); + EXPECT_FALSE(arg2.Parse("two", 3)); +#endif +} + } // namespace re2 diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 2f4b90cddd39d402e9b6a4cde904f4d3e9bde7a5..b1f7d7335d53cf40fce8a49451a0b8c049e86d98 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -12,6 +12,7 @@ #include #include #include +#include #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) #include #include /* for sysconf */ @@ -223,6 +224,15 @@ TEST(RE2, Extract) { ASSERT_EQ(s, "'foo'"); } +TEST(RE2, MaxSubmatchTooLarge) { + std::string s; + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + s = "foo"; + ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +} + TEST(RE2, Consume) { RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace std::string word; @@ -473,40 +483,39 @@ TEST(ProgramFanout, BigProgram) { RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); - std::map histogram; + std::vector histogram; - // 3 is the largest non-empty bucket and has 1 element. + // 3 is the largest non-empty bucket and has 2 element. ASSERT_EQ(3, re1.ProgramFanout(&histogram)); - ASSERT_EQ(1, histogram[3]); + ASSERT_EQ(2, histogram[3]); - // 7 is the largest non-empty bucket and has 10 elements. - ASSERT_EQ(7, re10.ProgramFanout(&histogram)); - ASSERT_EQ(10, histogram[7]); + // 6 is the largest non-empty bucket and has 11 elements. + ASSERT_EQ(6, re10.ProgramFanout(&histogram)); + ASSERT_EQ(11, histogram[6]); - // 10 is the largest non-empty bucket and has 100 elements. - ASSERT_EQ(10, re100.ProgramFanout(&histogram)); - ASSERT_EQ(100, histogram[10]); + // 9 is the largest non-empty bucket and has 101 elements. + ASSERT_EQ(9, re100.ProgramFanout(&histogram)); + ASSERT_EQ(101, histogram[9]); - // 13 is the largest non-empty bucket and has 1000 elements. 
+ // 13 is the largest non-empty bucket and has 1001 elements. ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); - ASSERT_EQ(1000, histogram[13]); + ASSERT_EQ(1001, histogram[13]); - // 2 is the largest non-empty bucket and has 3 elements. - // This differs from the others due to how reverse `.' works. + // 2 is the largest non-empty bucket and has 2 element. ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); - ASSERT_EQ(3, histogram[2]); + ASSERT_EQ(2, histogram[2]); - // 5 is the largest non-empty bucket and has 10 elements. + // 5 is the largest non-empty bucket and has 11 elements. ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); - ASSERT_EQ(10, histogram[5]); + ASSERT_EQ(11, histogram[5]); - // 9 is the largest non-empty bucket and has 100 elements. + // 9 is the largest non-empty bucket and has 101 elements. ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); - ASSERT_EQ(100, histogram[9]); + ASSERT_EQ(101, histogram[9]); - // 12 is the largest non-empty bucket and has 1000 elements. + // 12 is the largest non-empty bucket and has 1001 elements. ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); - ASSERT_EQ(1000, histogram[12]); + ASSERT_EQ(1001, histogram[12]); } // Issue 956519: handling empty character sets was @@ -1232,11 +1241,10 @@ TEST(RE2, DeepRecursion) { // Suggested by Josh Hyman. Failed when SearchOnePass was // not implementing case-folding. TEST(CaseInsensitive, MatchAndConsume) { - std::string result; std::string text = "A fish named *Wanda*"; StringPiece sp(text); - - EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result)); + StringPiece result; + EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); } @@ -1269,38 +1277,43 @@ TEST(RE2, CL8622304) { EXPECT_EQ(val, "1,0x2F,030,4,5"); } - // Check that RE2 returns correct regexp pieces on error. // In particular, make sure it returns whole runes // and that it always reports invalid UTF-8. 
// Also check that Perl error flag piece is big enough. static struct ErrorTest { const char *regexp; - const char *error; + RE2::ErrorCode error_code; + const char *error_arg; } error_tests[] = { - { "ab\\αcd", "\\α" }, - { "ef\\x☺01", "\\x☺0" }, - { "gh\\x1☺01", "\\x1☺" }, - { "ij\\x1", "\\x1" }, - { "kl\\x", "\\x" }, - { "uv\\x{0000☺}", "\\x{0000☺" }, - { "wx\\p{ABC", "\\p{ABC" }, - { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X - { "aa(?sm☺i", "(?sm☺" }, - { "bb[abc", "[abc" }, - - { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8 - { "op\377qr", "" }, - { "st\\x{00000\377", "" }, - { "zz\\p{\377}", "" }, - { "zz\\x{00\377}", "" }, - { "zz(?Pabc)", "" }, + { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, + { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, + { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, + { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, + { "kl\\x", RE2::ErrorBadEscape, "\\x" }, + { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, + { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, + // used to return (?s but the error is X + { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, + { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, + { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, + { "abc(def", RE2::ErrorMissingParen, "abc(def" }, + { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, + + // no argument string returned for invalid UTF-8 + { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, + { "op\377qr", RE2::ErrorBadUTF8, "" }, + { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, + { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, + { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, + { "zz(?Pabc)", RE2::ErrorBadUTF8, "" }, }; -TEST(RE2, ErrorArgs) { +TEST(RE2, ErrorCodeAndArg) { for (size_t i = 0; i < arraysize(error_tests); i++) { RE2 re(error_tests[i].regexp, RE2::Quiet); EXPECT_FALSE(re.ok()); - EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); + EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); + 
EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); } } @@ -1628,4 +1641,19 @@ TEST(RE2, Issue104) { ASSERT_EQ("小人小类小", s); } +TEST(RE2, Issue310) { + // (?:|a)* matched more text than (?:|a)+ did. + + std::string s = "aaa"; + StringPiece m; + + RE2 star("(?:|a)*"); + ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; + + RE2 plus("(?:|a)+"); + ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; +} + } // namespace re2 diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index 089d822c9e494d4211cb77219acec0067e371617..3eeb09889ed893724d671df08bf70e3c827925b9 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "util/benchmark.h" @@ -20,6 +21,7 @@ #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "util/mutex.h" #include "util/pcre.h" namespace re2 { @@ -152,32 +154,38 @@ ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; // Generate random text that won't contain the search string, // to test worst-case search behavior. -void MakeText(std::string* text, int64_t nbytes) { - srand(1); - text->resize(nbytes); - for (int64_t i = 0; i < nbytes; i++) { - // Generate a one-byte rune that isn't a control character (e.g. '\n'). - // Clipping to 0x20 introduces some bias, but we don't need uniformity. - int byte = rand() & 0x7F; - if (byte < 0x20) - byte = 0x20; - (*text)[i] = byte; - } +std::string RandomText(int64_t nbytes) { + static const std::string* const text = []() { + std::string* text = new std::string; + srand(1); + text->resize(16<<20); + for (int64_t i = 0; i < 16<<20; i++) { + // Generate a one-byte rune that isn't a control character (e.g. '\n'). + // Clipping to 0x20 introduces some bias, but we don't need uniformity. 
+ int byte = rand() & 0x7F; + if (byte < 0x20) + byte = 0x20; + (*text)[i] = byte; + } + return text; + }(); + CHECK_LE(nbytes, 16<<20); + return text->substr(0, nbytes); } // Makes text of size nbytes, then calls run to search // the text for regexp iters times. void Search(benchmark::State& state, const char* regexp, SearchImpl* search) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); search(state, regexp, s, Prog::kUnanchored, false); state.SetBytesProcessed(state.iterations() * state.range(0)); } -// These two are easy because they start with an A, -// giving the search loop something to memchr for. +// These three are easy because they have prefixes, +// giving the search loop something to prefix accel. #define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" #define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" +#define EASY2 "(?i)" EASY0 // This is a little harder, since it starts with a character class // and thus can't be memchr'ed. Could look for ABC and work backward, @@ -221,6 +229,18 @@ BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs() #endif BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +void Search_Easy2_CachedDFA(benchmark::State& state) { Search(state, EASY2, SearchCachedDFA); } +void Search_Easy2_CachedNFA(benchmark::State& state) { Search(state, EASY2, SearchCachedNFA); } +void Search_Easy2_CachedPCRE(benchmark::State& state) { Search(state, EASY2, SearchCachedPCRE); } +void Search_Easy2_CachedRE2(benchmark::State& state) { Search(state, EASY2, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy2_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy2_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy2_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy2_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + void 
Search_Medium_CachedDFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedDFA); } void Search_Medium_CachedNFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedNFA); } void Search_Medium_CachedPCRE(benchmark::State& state) { Search(state, MEDIUM, SearchCachedPCRE); } @@ -273,8 +293,7 @@ void SearchBigFixed(benchmark::State& state, SearchImpl* search) { std::string s; s.append(state.range(0)/2, 'x'); std::string regexp = "^" + s + ".*$"; - std::string t; - MakeText(&t, state.range(0)/2); + std::string t = RandomText(state.range(0)/2); s += t; search(state, regexp.c_str(), s, Prog::kUnanchored, true); state.SetBytesProcessed(state.iterations() * state.range(0)); @@ -295,8 +314,7 @@ BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs // Benchmark: FindAndConsume void FindAndConsume(benchmark::State& state) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); s.append("Hello World"); RE2 re("((Hello World))"); for (auto _ : state) { @@ -314,8 +332,7 @@ BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); void SearchSuccess(benchmark::State& state, const char* regexp, SearchImpl* search) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); search(state, regexp, s, Prog::kAnchored, true); state.SetBytesProcessed(state.iterations() * state.range(0)); } @@ -385,8 +402,7 @@ BENCHMARK_RANGE(Search_Success1_CachedBitState, 8, 2<<20)->ThreadRange(1, NumCPU // Note that OnePass doesn't implement it! 
void SearchAltMatch(benchmark::State& state, SearchImpl* search) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); search(state, "\\C*", s, Prog::kAnchored, true); state.SetBytesProcessed(state.iterations() * state.range(0)); } @@ -770,8 +786,7 @@ BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); // Makes text of size nbytes, then calls run to search // the text for regexp iters times. void SearchPhone(benchmark::State& state, ParseImpl* search) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); s.append("(650) 253-0001"); search(state, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s); state.SetBytesProcessed(state.iterations() * state.range(0)); @@ -792,40 +807,6 @@ BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); /* TODO(rsc): Make this work again. - -// Generates and returns a string over binary alphabet {0,1} that contains -// all possible binary sequences of length n as subsequences. The obvious -// brute force method would generate a string of length n * 2^n, but this -// generates a string of length n + 2^n - 1 called a De Bruijn cycle. -// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. -static std::string DeBruijnString(int n) { - CHECK_LT(n, 8*sizeof(int)); - CHECK_GT(n, 0); - - std::vector did(1<; + Prog* prog = cache[regexp]; + if (prog == NULL) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + prog = re->CompileToProg(int64_t{1}<<31); // mostly for the DFA + CHECK(prog); + cache[regexp] = prog; + re->Decref(); + // We must call this here - while we have exclusive access. 
+ prog->IsOnePass(); + } + return prog; +} + +PCRE* GetCachedPCRE(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + PCRE* re = cache[regexp]; + if (re == NULL) { + re = new PCRE(regexp, PCRE::UTF8); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +RE2* GetCachedRE2(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + RE2* re = cache[regexp]; + if (re == NULL) { + re = new RE2(regexp); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + void SearchCachedDFA(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(1LL<<31); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); for (auto _ : state) { bool failed = false; CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, @@ -987,63 +1009,45 @@ void SearchCachedDFA(benchmark::State& state, const char* regexp, expect_match); CHECK(!failed); } - delete prog; - re->Decref(); } void SearchCachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); for (auto _ : state) { CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); 
CHECK(prog->IsOnePass()); for (auto _ : state) { CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); for (auto _ : state) { CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(PCRE::FullMatch(text, re), expect_match); @@ -1055,8 +1059,7 @@ void SearchCachedPCRE(benchmark::State& state, const char* regexp, void SearchCachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(RE2::FullMatch(text, re), expect_match); @@ -1149,67 +1152,46 @@ void Parse3RE2(benchmark::State& state, const char* regexp, void Parse3CachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. 
for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedBacktrack(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. 
for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1, sp2, sp3; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); @@ -1218,8 +1200,7 @@ void Parse3CachedPCRE(benchmark::State& state, const char* regexp, void Parse3CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); StringPiece sp1, sp2, sp3; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); @@ -1296,67 +1277,46 @@ void Parse1RE2(benchmark::State& state, const char* regexp, void Parse1CachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[2]; // 2 because sp[0] is whole match. 
for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedBacktrack(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1)); @@ -1365,8 +1325,7 @@ void Parse1CachedPCRE(benchmark::State& state, const char* regexp, void Parse1CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); StringPiece sp1; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1)); @@ -1375,8 +1334,7 @@ void Parse1CachedRE2(benchmark::State& state, const char* regexp, void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ 
: state) { StringPiece sp1, sp2; CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); @@ -1385,8 +1343,7 @@ void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { StringPiece sp1, sp2; CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); @@ -1395,8 +1352,7 @@ void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { StringPiece sp1; CHECK(PCRE::PartialMatch(text, re, &sp1)); @@ -1405,8 +1361,7 @@ void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, void SearchParse1CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { StringPiece sp1; CHECK(RE2::PartialMatch(text, re, &sp1)); @@ -1541,8 +1496,7 @@ BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); void FullMatchPCRE(benchmark::State& state, const char *regexp) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); s += "ABCDEFGHIJ"; PCRE re(regexp); for (auto _ : state) { @@ -1552,8 +1506,7 @@ void FullMatchPCRE(benchmark::State& state, const char *regexp) { } void FullMatchRE2(benchmark::State& state, const char *regexp) { - std::string s; - MakeText(&s, state.range(0)); + std::string s = RandomText(state.range(0)); s += "ABCDEFGHIJ"; RE2 re(regexp, RE2::Latin1); for (auto _ : state) { diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc index 
54600456a88ef9746b8830f73b170c1ea2b14fbf..60a11f865382aacf5519a5feb99c42a98eedab5e 100644 --- a/re2/testing/required_prefix_test.cc +++ b/re2/testing/required_prefix_test.cc @@ -6,6 +6,7 @@ #include "util/test.h" #include "util/logging.h" +#include "re2/prog.h" #include "re2/regexp.h" namespace re2 { @@ -19,15 +20,18 @@ struct PrefixTest { }; static PrefixTest tests[] = { - // If the regexp is missing a ^, there's no required prefix. - { "abc", false }, + // Empty cases. { "", false }, { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has no ^, there's no required prefix. + { "abc", false }, // If the regexp immediately goes into // something not a literal match, there's no required prefix. - { "^(abc)", false }, { "^a*", false }, + { "^(abc)", false }, // Otherwise, it should work. { "^abc$", true, "abc", false, "(?-m:$)" }, @@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) { bool f; Regexp* s; ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf") + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") << " " << re->Dump(); if (t.return_value) { ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(s->ToString(), std::string(t.suffix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); s->Decref(); } re->Decref(); @@ -69,4 +73,127 @@ TEST(RequiredPrefix, SimpleTests) { } } +static PrefixTest for_accel_tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has a ^, there's no required prefix. 
+ { "^abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "a*", false }, + + // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through" + // capturing groups, but doesn't try to glue prefix fragments together. + { "(a?)def", false }, + { "(ab?)def", true, "a", false }, + { "(abc?)def", true, "ab", false }, + { "(()a)def", false }, + { "((a)b)def", true, "a", false }, + { "((ab)c)def", true, "ab", false }, + + // Otherwise, it should work. + { "abc$", true, "abc", false }, + { "abc", true, "abc", false }, + { "(?i)abc", true, "abc", true }, + { "abcd*", true, "abc", false }, + { "[Aa][Bb]cd*", true, "ab", true }, + { "ab[Cc]d*", true, "ab", false }, + { "☺abc", true, "☺abc", false }, +}; + +TEST(RequiredPrefixForAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + const PrefixTest& t = for_accel_tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + } + re->Decref(); + } + } +} + +TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) { + Regexp* re; + std::string p; + bool f; + + // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'. 
+ re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "klm"); + ASSERT_EQ(f, true); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "stu"); + ASSERT_EQ(f, true); + re->Decref(); + + // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'. + // This is because they match U+212A and U+017F, respectively, and + // so the parser ends up emitting character classes, not literals. + re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); + re->Decref(); +} + +static const char* prefix_accel_tests[] = { + "aababc\\d+", + "(?i)AABABC\\d+", +}; + +TEST(PrefixAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) { + const char* pattern = prefix_accel_tests[i]; + Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + ASSERT_TRUE(prog->can_prefix_accel()); + for (int j = 0; j < 100; j++) { + std::string text(j, 'a'); + const char* p = reinterpret_cast( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("aababc"); + for (int k = 0; k < 100; k++) { + text.append(k, 'a'); + p = reinterpret_cast( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(j, p - text.data()); + } + } + delete prog; + re->Decref(); + } +} + } // namespace re2 diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc index c20f501daef166ae0dafbc2565b63f3c5ce9d356..5d86dbfa1e2cf714b8acf789702a9e1bd6752f12 100644 --- a/re2/testing/search_test.cc +++ 
b/re2/testing/search_test.cc @@ -308,6 +308,8 @@ RegexpTest simple_tests[] = { // Former bugs. { "a\\C*|ba\\C", "baba" }, { "\\w*I\\w*", "Inc." }, + { "(?:|a)*", "aaa" }, + { "(?:|a)+", "aaa" }, }; TEST(Regexp, SearchTests) { diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc index 61d1cf295f3252eba0fa9718a53a9a2d17f1e7ba..5a760c4b5e27d986ec582bf512f7968e6906194c 100644 --- a/re2/testing/set_test.cc +++ b/re2/testing/set_test.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "util/test.h" #include "util/logging.h" @@ -201,4 +202,29 @@ TEST(Set, Prefix) { ASSERT_EQ(v[0], 0); } +TEST(Set, MoveSemantics) { + RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); + ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); + + // The moved-to object should do what the moved-from object did. + RE2::Set s2 = std::move(s1); + ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); + + // The moved-from object should have been reset and be reusable. + ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) 
+ s1 = std::move(s2); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); +} + } // namespace re2 diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc index 030cc457e1097b7bd1cae48025fc25b89aaed3d3..44837fe90e682116d52a5ac913f8ecd1ba994343 100644 --- a/re2/testing/string_generator.cc +++ b/re2/testing/string_generator.cc @@ -111,4 +111,31 @@ void StringGenerator::GenerateNULL() { hasnext_ = true; } +std::string DeBruijnString(int n) { + CHECK_GE(n, 1); + CHECK_LE(n, 29); + const size_t size = size_t{1} << static_cast(n); + const size_t mask = size - 1; + std::vector did(size, false); + std::string s; + s.reserve(static_cast(n) + size); + for (size_t i = 0; i < static_cast(n - 1); i++) + s += '0'; + size_t bits = 0; + for (size_t i = 0; i < size; i++) { + bits <<= 1; + bits &= mask; + if (!did[bits | 1]) { + bits |= 1; + s += '1'; + } else { + s += '0'; + } + CHECK(!did[bits]); + did[bits] = true; + } + CHECK_EQ(s.size(), static_cast(n - 1) + size); + return s; +} + } // namespace re2 diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h index 6184176523a44a9e569c8d648a9e934fff0e9148..73fbb514513e8d0d239cdffd1439b9e7e4ac2fb2 100644 --- a/re2/testing/string_generator.h +++ b/re2/testing/string_generator.h @@ -58,6 +58,19 @@ class StringGenerator { StringGenerator& operator=(const StringGenerator&) = delete; }; +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n-1 + 2^n called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +// +// Such a string is useful for testing a DFA. 
If you have a DFA +// where distinct last n bytes implies distinct states, then running on a +// DeBruijn string causes the DFA to need to create a new state at every +// position in the input, never reusing any states until it gets to the +// end of the string. This is the worst possible case for DFA execution. +std::string DeBruijnString(int n); + } // namespace re2 #endif // RE2_TESTING_STRING_GENERATOR_H_ diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc index 67d262c234b84dcf944398ec86e53e548c6aa5a3..b0c22f25b2159f5ecd95986252befed76db70e08 100644 --- a/re2/testing/tester.cc +++ b/re2/testing/tester.cc @@ -86,6 +86,20 @@ static uint32_t Engines() { // The result of running a match. struct TestInstance::Result { + Result() + : skipped(false), + matched(false), + untrusted(false), + have_submatch(false), + have_submatch0(false) { + ClearSubmatch(); + } + + void ClearSubmatch() { + for (int i = 0; i < kMaxSubmatch; i++) + submatch[i] = StringPiece(); + } + bool skipped; // test skipped: wasn't applicable bool matched; // found a match bool untrusted; // don't really trust the answer @@ -103,8 +117,8 @@ static std::string FormatCapture(const StringPiece& text, if (s.data() == NULL) return "(?,?)"; return StringPrintf("(%td,%td)", - s.begin() - text.begin(), - s.end() - text.begin()); + BeginPtr(s) - BeginPtr(text), + EndPtr(s) - BeginPtr(text)); } // Returns whether text contains non-ASCII (>= 0x80) bytes. @@ -292,9 +306,6 @@ void TestInstance::RunSearch(Engine type, const StringPiece& orig_context, Prog::Anchor anchor, Result* result) { - // Result is not trivial, so we cannot freely clear it with memset(3), - // but zeroing objects like so is safe and expedient for our purposes. 
- memset(reinterpret_cast(result), 0, sizeof *result); if (regexp_ == NULL) { result->skipped = true; return; @@ -392,7 +403,7 @@ void TestInstance::RunSearch(Engine type, case kEngineRE2: case kEngineRE2a: case kEngineRE2b: { - if (!re2_ || text.end() != context.end()) { + if (!re2_ || EndPtr(text) != EndPtr(context)) { result->skipped = true; break; } @@ -407,8 +418,8 @@ void TestInstance::RunSearch(Engine type, result->matched = re2_->Match( context, - static_cast(text.begin() - context.begin()), - static_cast(text.end() - context.begin()), + static_cast(BeginPtr(text) - BeginPtr(context)), + static_cast(EndPtr(text) - BeginPtr(context)), re_anchor, result->submatch, nsubmatch); @@ -417,8 +428,8 @@ void TestInstance::RunSearch(Engine type, } case kEnginePCRE: { - if (!re_ || text.begin() != context.begin() || - text.end() != context.end()) { + if (!re_ || BeginPtr(text) != BeginPtr(context) || + EndPtr(text) != EndPtr(context)) { result->skipped = true; break; } @@ -478,7 +489,7 @@ void TestInstance::RunSearch(Engine type, } if (!result->matched) - memset(result->submatch, 0, sizeof result->submatch); + result->ClearSubmatch(); } // Checks whether r is okay given that correct is the right answer. @@ -595,9 +606,9 @@ void TestInstance::LogMatch(const char* prefix, Engine e, << " text " << CEscape(text) << " (" - << text.begin() - context.begin() + << BeginPtr(text) - BeginPtr(context) << "," - << text.end() - context.begin() + << EndPtr(text) - BeginPtr(context) << ") of context " << CEscape(context) << " (" << FormatKind(kind_) diff --git a/re2/tostring.cc b/re2/tostring.cc index 4545a92ddb331cd0a138336cb6d6a569e61b52dc..9c1c038ca6e0913259b6b14a0f1ac6d05bf7fe9e 100644 --- a/re2/tostring.cc +++ b/re2/tostring.cc @@ -291,7 +291,7 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, // There's no syntax accepted by the parser to generate // this node (it is generated by RE2::Set) so make something // up that is readable but won't compile. 
- t_->append("(?HaveMatch:%d)", re->match_id()); + t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id())); break; } diff --git a/re2/unicode.py b/re2/unicode.py index 56ca8119c62669caa3f9747ecc442b9e4fc9935c..727bea5f1126530d7829f2343371de01fdbdf196 100644 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -13,7 +13,7 @@ import re from six.moves import urllib # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd" +_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index 4ea2533edaea336822dd3574cd7f84278b1e19aa..d9de2821d5f86a248670e70fab52f7f458ca1b00 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1381 groups, 2792 pairs, 356 ranges +// 1424 groups, 2878 pairs, 367 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -299,8 +299,8 @@ const CaseFold unicode_casefold[] = { { 8579, 8580, OddEven }, { 9398, 9423, 26 }, { 9424, 9449, -26 }, - { 11264, 11310, 48 }, - { 11312, 11358, -48 }, + { 11264, 11311, 48 }, + { 11312, 11359, -48 }, { 11360, 11361, EvenOdd }, { 11362, 11362, -10743 }, { 11363, 11363, -3814 }, @@ -344,11 +344,14 @@ const CaseFold unicode_casefold[] = { { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, - { 42932, 42943, EvenOdd }, - { 42946, 42947, EvenOdd }, + { 42932, 42947, EvenOdd }, { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, + { 42951, 42954, OddEven }, + { 42960, 42961, EvenOdd }, + { 42966, 42969, EvenOdd }, + { 42997, 42998, OddEven }, { 43859, 43859, -928 }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, @@ -357,6 +360,14 @@ const CaseFold unicode_casefold[] = { { 66600, 66639, -40 }, { 66736, 66771, 40 }, { 66776, 66811, -40 }, + { 66928, 66938, 39 }, + { 66940, 66954, 39 }, + { 66956, 66962, 39 }, + { 66964, 66965, 39 }, + 
{ 66967, 66977, -39 }, + { 66979, 66993, -39 }, + { 66995, 67001, -39 }, + { 67003, 67004, -39 }, { 68736, 68786, 64 }, { 68800, 68850, -64 }, { 71840, 71871, 32 }, @@ -366,9 +377,9 @@ const CaseFold unicode_casefold[] = { { 125184, 125217, 34 }, { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 356; +const int num_unicode_casefold = 367; -// 1381 groups, 1411 pairs, 198 ranges +// 1424 groups, 1454 pairs, 205 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -519,7 +530,7 @@ const CaseFold unicode_tolower[] = { { 8544, 8559, 16 }, { 8579, 8579, OddEven }, { 9398, 9423, 26 }, - { 11264, 11310, 48 }, + { 11264, 11311, 48 }, { 11360, 11360, EvenOdd }, { 11362, 11362, -10743 }, { 11363, 11363, -3814 }, @@ -555,21 +566,28 @@ const CaseFold unicode_tolower[] = { { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, - { 42932, 42942, EvenOddSkip }, - { 42946, 42946, EvenOdd }, + { 42932, 42946, EvenOddSkip }, { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, + { 42951, 42953, OddEvenSkip }, + { 42960, 42960, EvenOdd }, + { 42966, 42968, EvenOddSkip }, + { 42997, 42997, OddEven }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, { 66736, 66771, 40 }, + { 66928, 66938, 39 }, + { 66940, 66954, 39 }, + { 66956, 66962, 39 }, + { 66964, 66965, 39 }, { 68736, 68786, 64 }, { 71840, 71871, 32 }, { 93760, 93791, 32 }, { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 198; +const int num_unicode_tolower = 205; diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc index 63e611658c25848daf10c2bc0b7b558b8a03bf0b..2a8d7dae1f1ff8ee6d01c99e7f5d58ec7311e596 100644 --- a/re2/unicode_groups.cc +++ b/re2/unicode_groups.cc @@ -15,6 +15,7 @@ static const URange16 C_range16[] = { { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2192, 2193 }, { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, @@ -46,6 +47,7 @@ static const URange16 Cf_range16[] = { { 1564, 1564 }, { 1757, 
1757 }, { 1807, 1807 }, + { 2192, 2193 }, { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, @@ -124,8 +126,9 @@ static const URange16 L_range16[] = { { 2088, 2088 }, { 2112, 2136 }, { 2144, 2154 }, - { 2208, 2228 }, - { 2230, 2237 }, + { 2160, 2183 }, + { 2185, 2190 }, + { 2208, 2249 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -190,6 +193,7 @@ static const URange16 L_range16[] = { { 3114, 3129 }, { 3133, 3133 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3169 }, { 3200, 3200 }, { 3205, 3212 }, @@ -198,10 +202,10 @@ static const URange16 L_range16[] = { { 3242, 3251 }, { 3253, 3257 }, { 3261, 3261 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 3297 }, { 3313, 3314 }, - { 3333, 3340 }, + { 3332, 3340 }, { 3342, 3344 }, { 3346, 3386 }, { 3389, 3389 }, @@ -269,9 +273,8 @@ static const URange16 L_range16[] = { { 5761, 5786 }, { 5792, 5866 }, { 5873, 5880 }, - { 5888, 5900 }, - { 5902, 5905 }, - { 5920, 5937 }, + { 5888, 5905 }, + { 5919, 5937 }, { 5952, 5969 }, { 5984, 5996 }, { 5998, 6000 }, @@ -292,7 +295,7 @@ static const URange16 L_range16[] = { { 6688, 6740 }, { 6823, 6823 }, { 6917, 6963 }, - { 6981, 6987 }, + { 6981, 6988 }, { 7043, 7072 }, { 7086, 7087 }, { 7098, 7141 }, @@ -343,9 +346,7 @@ static const URange16 L_range16[] = { { 8517, 8521 }, { 8526, 8526 }, { 8579, 8580 }, - { 11264, 11310 }, - { 11312, 11358 }, - { 11360, 11492 }, + { 11264, 11492 }, { 11499, 11502 }, { 11506, 11507 }, { 11520, 11557 }, @@ -372,11 +373,10 @@ static const URange16 L_range16[] = { { 12540, 12543 }, { 12549, 12591 }, { 12593, 12686 }, - { 12704, 12730 }, + { 12704, 12735 }, { 12784, 12799 }, - { 13312, 19893 }, - { 19968, 40943 }, - { 40960, 42124 }, + { 13312, 19903 }, + { 19968, 42124 }, { 42192, 42237 }, { 42240, 42508 }, { 42512, 42527 }, @@ -386,9 +386,11 @@ static const URange16 L_range16[] = { { 42656, 42725 }, { 42775, 42783 }, { 42786, 42888 }, - { 42891, 42943 }, - { 42946, 42950 }, - { 42999, 43009 }, + { 42891, 42954 }, + { 42960, 42961 }, + { 42963, 42963 }, 
+ { 42965, 42969 }, + { 42994, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -425,7 +427,7 @@ static const URange16 L_range16[] = { { 43808, 43814 }, { 43816, 43822 }, { 43824, 43866 }, - { 43868, 43879 }, + { 43868, 43881 }, { 43888, 44002 }, { 44032, 55203 }, { 55216, 55238 }, @@ -478,9 +480,20 @@ static const URange32 L_range32[] = { { 66776, 66811 }, { 66816, 66855 }, { 66864, 66915 }, + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, { 67072, 67382 }, { 67392, 67413 }, { 67424, 67431 }, + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, @@ -511,15 +524,22 @@ static const URange32 L_range32[] = { { 68736, 68786 }, { 68800, 68850 }, { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69488, 69505 }, + { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, + { 69745, 69746 }, + { 69749, 69749 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, { 69956, 69956 }, + { 69959, 69959 }, { 69968, 70002 }, { 70006, 70006 }, { 70019, 70066 }, @@ -545,7 +565,7 @@ static const URange32 L_range32[] = { { 70493, 70497 }, { 70656, 70708 }, { 70727, 70730 }, - { 70751, 70751 }, + { 70751, 70753 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -556,9 +576,16 @@ static const URange32 L_range32[] = { { 71296, 71338 }, { 71352, 71352 }, { 71424, 71450 }, + { 71488, 71494 }, { 71680, 71723 }, { 71840, 71903 }, - { 71935, 71935 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, { 72096, 72103 }, { 72106, 72144 }, { 72161, 72161 }, @@ -569,7 +596,7 @@ static const URange32 L_range32[] = { { 72272, 72272 }, { 72284, 72329 }, { 72349, 72349 }, - { 72384, 72440 }, + { 72368, 72440 }, { 72704, 72712 }, { 72714, 72750 }, { 72768, 
72768 }, @@ -583,12 +610,15 @@ static const URange32 L_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, + { 77712, 77808 }, { 77824, 78894 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, + { 92784, 92862 }, { 92880, 92909 }, { 92928, 92975 }, { 92992, 92995 }, @@ -601,8 +631,12 @@ static const URange32 L_range32[] = { { 94176, 94177 }, { 94179, 94179 }, { 94208, 100343 }, - { 100352, 101106 }, - { 110592, 110878 }, + { 100352, 101589 }, + { 101632, 101640 }, + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, + { 110592, 110882 }, { 110928, 110930 }, { 110948, 110951 }, { 110960, 111355 }, @@ -640,10 +674,16 @@ static const URange32 L_range32[] = { { 120714, 120744 }, { 120746, 120770 }, { 120772, 120779 }, + { 122624, 122654 }, { 123136, 123180 }, { 123191, 123197 }, { 123214, 123214 }, + { 123536, 123565 }, { 123584, 123627 }, + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, { 124928, 125124 }, { 125184, 125251 }, { 125259, 125259 }, @@ -680,12 +720,13 @@ static const URange32 L_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173782 }, - { 173824, 177972 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Ll_range16[] = { { 97, 122 }, @@ -1119,7 +1160,7 @@ static const URange16 Ll_range16[] = { { 8518, 8521 }, { 8526, 8526 }, { 8580, 8580 }, - { 11312, 11358 }, + { 11312, 11359 }, { 11361, 11361 }, { 11365, 11366 }, { 11368, 11368 }, @@ -1288,10 +1329,19 @@ static const URange16 Ll_range16[] = { { 42939, 42939 }, { 42941, 42941 }, { 42943, 42943 }, + { 42945, 42945 }, { 42947, 42947 }, + { 42952, 42952 }, + { 42954, 42954 }, + { 42961, 42961 }, + { 42963, 42963 }, + { 42965, 42965 }, + { 42967, 42967 }, + { 42969, 42969 }, + { 42998, 42998 }, { 43002, 43002 }, { 43824, 
43866 }, - { 43872, 43879 }, + { 43872, 43880 }, { 43888, 43967 }, { 64256, 64262 }, { 64275, 64279 }, @@ -1300,6 +1350,10 @@ static const URange16 Ll_range16[] = { static const URange32 Ll_range32[] = { { 66600, 66639 }, { 66776, 66811 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, { 68800, 68850 }, { 71872, 71903 }, { 93792, 93823 }, @@ -1331,6 +1385,8 @@ static const URange32 Ll_range32[] = { { 120746, 120770 }, { 120772, 120777 }, { 120779, 120779 }, + { 122624, 122633 }, + { 122635, 122654 }, { 125218, 125251 }, }; static const URange16 Lm_range16[] = { @@ -1349,6 +1405,7 @@ static const URange16 Lm_range16[] = { { 2074, 2074 }, { 2084, 2084 }, { 2088, 2088 }, + { 2249, 2249 }, { 2417, 2417 }, { 3654, 3654 }, { 3782, 3782 }, @@ -1379,6 +1436,7 @@ static const URange16 Lm_range16[] = { { 42775, 42783 }, { 42864, 42864 }, { 42888, 42888 }, + { 42994, 42996 }, { 43000, 43001 }, { 43471, 43471 }, { 43494, 43494 }, @@ -1386,14 +1444,21 @@ static const URange16 Lm_range16[] = { { 43741, 43741 }, { 43763, 43764 }, { 43868, 43871 }, + { 43881, 43881 }, { 65392, 65392 }, { 65438, 65439 }, }; static const URange32 Lm_range32[] = { + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, { 92992, 92995 }, { 94099, 94111 }, { 94176, 94177 }, { 94179, 94179 }, + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, { 123191, 123197 }, { 125259, 125259 }, }; @@ -1421,8 +1486,9 @@ static const URange16 Lo_range16[] = { { 2048, 2069 }, { 2112, 2136 }, { 2144, 2154 }, - { 2208, 2228 }, - { 2230, 2237 }, + { 2160, 2183 }, + { 2185, 2190 }, + { 2208, 2248 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -1487,6 +1553,7 @@ static const URange16 Lo_range16[] = { { 3114, 3129 }, { 3133, 3133 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3169 }, { 3200, 3200 }, { 3205, 3212 }, @@ -1495,10 +1562,10 @@ static const URange16 Lo_range16[] = { { 3242, 3251 }, { 3253, 3257 }, { 3261, 3261 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 
3297 }, { 3313, 3314 }, - { 3333, 3340 }, + { 3332, 3340 }, { 3342, 3344 }, { 3346, 3386 }, { 3389, 3389 }, @@ -1559,9 +1626,8 @@ static const URange16 Lo_range16[] = { { 5761, 5786 }, { 5792, 5866 }, { 5873, 5880 }, - { 5888, 5900 }, - { 5902, 5905 }, - { 5920, 5937 }, + { 5888, 5905 }, + { 5919, 5937 }, { 5952, 5969 }, { 5984, 5996 }, { 5998, 6000 }, @@ -1581,7 +1647,7 @@ static const URange16 Lo_range16[] = { { 6656, 6678 }, { 6688, 6740 }, { 6917, 6963 }, - { 6981, 6987 }, + { 6981, 6988 }, { 7043, 7072 }, { 7086, 7087 }, { 7098, 7141 }, @@ -1611,11 +1677,10 @@ static const URange16 Lo_range16[] = { { 12543, 12543 }, { 12549, 12591 }, { 12593, 12686 }, - { 12704, 12730 }, + { 12704, 12735 }, { 12784, 12799 }, - { 13312, 19893 }, - { 19968, 40943 }, - { 40960, 40980 }, + { 13312, 19903 }, + { 19968, 40980 }, { 40982, 42124 }, { 42192, 42231 }, { 42240, 42507 }, @@ -1740,15 +1805,22 @@ static const URange32 Lo_range32[] = { { 68480, 68497 }, { 68608, 68680 }, { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69488, 69505 }, + { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, + { 69745, 69746 }, + { 69749, 69749 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, { 69956, 69956 }, + { 69959, 69959 }, { 69968, 70002 }, { 70006, 70006 }, { 70019, 70066 }, @@ -1774,7 +1846,7 @@ static const URange32 Lo_range32[] = { { 70493, 70497 }, { 70656, 70708 }, { 70727, 70730 }, - { 70751, 70751 }, + { 70751, 70753 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -1785,8 +1857,15 @@ static const URange32 Lo_range32[] = { { 71296, 71338 }, { 71352, 71352 }, { 71424, 71450 }, + { 71488, 71494 }, { 71680, 71723 }, - { 71935, 71935 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, { 72096, 72103 }, { 72106, 72144 }, { 72161, 72161 }, @@ -1797,7 +1876,7 @@ static const URange32 Lo_range32[] = { { 72272, 
72272 }, { 72284, 72329 }, { 72349, 72349 }, - { 72384, 72440 }, + { 72368, 72440 }, { 72704, 72712 }, { 72714, 72750 }, { 72768, 72768 }, @@ -1811,12 +1890,15 @@ static const URange32 Lo_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, + { 77712, 77808 }, { 77824, 78894 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, + { 92784, 92862 }, { 92880, 92909 }, { 92928, 92975 }, { 93027, 93047 }, @@ -1824,8 +1906,9 @@ static const URange32 Lo_range32[] = { { 93952, 94026 }, { 94032, 94032 }, { 94208, 100343 }, - { 100352, 101106 }, - { 110592, 110878 }, + { 100352, 101589 }, + { 101632, 101640 }, + { 110592, 110882 }, { 110928, 110930 }, { 110948, 110951 }, { 110960, 111355 }, @@ -1833,9 +1916,15 @@ static const URange32 Lo_range32[] = { { 113776, 113788 }, { 113792, 113800 }, { 113808, 113817 }, + { 122634, 122634 }, { 123136, 123180 }, { 123214, 123214 }, + { 123536, 123565 }, { 123584, 123627 }, + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, { 124928, 125124 }, { 126464, 126467 }, { 126469, 126495 }, @@ -1870,12 +1959,13 @@ static const URange32 Lo_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173782 }, - { 173824, 177972 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Lt_range16[] = { { 453, 453 }, @@ -2321,7 +2411,7 @@ static const URange16 Lu_range16[] = { { 8510, 8511 }, { 8517, 8517 }, { 8579, 8579 }, - { 11264, 11310 }, + { 11264, 11311 }, { 11360, 11360 }, { 11362, 11364 }, { 11367, 11367 }, @@ -2486,13 +2576,23 @@ static const URange16 Lu_range16[] = { { 42938, 42938 }, { 42940, 42940 }, { 42942, 42942 }, + { 42944, 42944 }, { 42946, 42946 }, - { 42948, 42950 }, + { 42948, 42951 }, + { 42953, 42953 }, + { 42960, 42960 }, + { 42966, 42966 }, + { 42968, 42968 }, + { 42997, 
42997 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, { 66736, 66771 }, + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, { 68736, 68786 }, { 71840, 71871 }, { 93760, 93791 }, @@ -2554,7 +2654,8 @@ static const URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2259, 2273 }, + { 2200, 2207 }, + { 2250, 2273 }, { 2275, 2307 }, { 2362, 2364 }, { 2366, 2383 }, @@ -2588,7 +2689,7 @@ static const URange16 M_range16[] = { { 2878, 2884 }, { 2887, 2888 }, { 2891, 2893 }, - { 2902, 2903 }, + { 2901, 2903 }, { 2914, 2915 }, { 2946, 2946 }, { 3006, 3010 }, @@ -2596,6 +2697,7 @@ static const URange16 M_range16[] = { { 3018, 3021 }, { 3031, 3031 }, { 3072, 3076 }, + { 3132, 3132 }, { 3134, 3140 }, { 3142, 3144 }, { 3146, 3149 }, @@ -2615,7 +2717,7 @@ static const URange16 M_range16[] = { { 3402, 3405 }, { 3415, 3415 }, { 3426, 3427 }, - { 3458, 3459 }, + { 3457, 3459 }, { 3530, 3530 }, { 3535, 3540 }, { 3542, 3542 }, @@ -2647,13 +2749,14 @@ static const URange16 M_range16[] = { { 4239, 4239 }, { 4250, 4253 }, { 4957, 4959 }, - { 5906, 5908 }, + { 5906, 5909 }, { 5938, 5940 }, { 5970, 5971 }, { 6002, 6003 }, { 6068, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6159, 6159 }, { 6277, 6278 }, { 6313, 6313 }, { 6432, 6443 }, @@ -2662,7 +2765,7 @@ static const URange16 M_range16[] = { { 6741, 6750 }, { 6752, 6780 }, { 6783, 6783 }, - { 6832, 6846 }, + { 6832, 6862 }, { 6912, 6916 }, { 6964, 6980 }, { 7019, 7027 }, @@ -2675,8 +2778,7 @@ static const URange16 M_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7415, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8400, 8432 }, { 11503, 11505 }, { 11647, 11647 }, @@ -2691,6 +2793,7 @@ static const URange16 M_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43043, 43047 }, + { 43052, 43052 }, { 43136, 43137 }, { 43188, 43205 }, { 43232, 43249 }, @@ -2728,11 +2831,16 @@ static const URange32 M_range32[] = { { 68159, 68159 }, { 68325, 
68326 }, { 68900, 68903 }, + { 69291, 69292 }, { 69446, 69456 }, + { 69506, 69509 }, { 69632, 69634 }, { 69688, 69702 }, + { 69744, 69744 }, + { 69747, 69748 }, { 69759, 69762 }, { 69808, 69818 }, + { 69826, 69826 }, { 69888, 69890 }, { 69927, 69940 }, { 69957, 69958 }, @@ -2740,6 +2848,7 @@ static const URange32 M_range32[] = { { 70016, 70018 }, { 70067, 70080 }, { 70089, 70092 }, + { 70094, 70095 }, { 70188, 70199 }, { 70206, 70206 }, { 70367, 70378 }, @@ -2762,6 +2871,11 @@ static const URange32 M_range32[] = { { 71339, 71351 }, { 71453, 71467 }, { 71724, 71738 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71995, 71998 }, + { 72000, 72000 }, + { 72002, 72003 }, { 72145, 72151 }, { 72154, 72160 }, { 72164, 72164 }, @@ -2789,7 +2903,11 @@ static const URange32 M_range32[] = { { 94031, 94031 }, { 94033, 94087 }, { 94095, 94098 }, + { 94180, 94180 }, + { 94192, 94193 }, { 113821, 113822 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119141, 119145 }, { 119149, 119154 }, { 119163, 119170 }, @@ -2808,6 +2926,7 @@ static const URange32 M_range32[] = { { 122915, 122916 }, { 122918, 122922 }, { 123184, 123190 }, + { 123566, 123566 }, { 123628, 123631 }, { 125136, 125142 }, { 125252, 125258 }, @@ -2871,6 +2990,8 @@ static const URange16 Mc_range16[] = { { 4231, 4236 }, { 4239, 4239 }, { 4250, 4252 }, + { 5909, 5909 }, + { 5940, 5940 }, { 6070, 6070 }, { 6078, 6085 }, { 6087, 6088 }, @@ -2935,6 +3056,7 @@ static const URange32 Mc_range32[] = { { 70018, 70018 }, { 70067, 70069 }, { 70079, 70080 }, + { 70094, 70094 }, { 70188, 70190 }, { 70194, 70195 }, { 70197, 70197 }, @@ -2966,6 +3088,11 @@ static const URange32 Mc_range32[] = { { 71462, 71462 }, { 71724, 71726 }, { 71736, 71736 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71997, 71997 }, + { 72000, 72000 }, + { 72002, 72002 }, { 72145, 72147 }, { 72156, 72159 }, { 72164, 72164 }, @@ -2982,6 +3109,7 @@ static const URange32 Mc_range32[] = { { 73110, 73110 }, { 73461, 73462 }, { 94033, 94087 }, + { 94192, 94193 }, { 
119141, 119142 }, { 119149, 119154 }, }; @@ -3017,7 +3145,8 @@ static const URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2259, 2273 }, + { 2200, 2207 }, + { 2250, 2273 }, { 2275, 2306 }, { 2362, 2362 }, { 2364, 2364 }, @@ -3051,13 +3180,14 @@ static const URange16 Mn_range16[] = { { 2879, 2879 }, { 2881, 2884 }, { 2893, 2893 }, - { 2902, 2902 }, + { 2901, 2902 }, { 2914, 2915 }, { 2946, 2946 }, { 3008, 3008 }, { 3021, 3021 }, { 3072, 3072 }, { 3076, 3076 }, + { 3132, 3132 }, { 3134, 3136 }, { 3142, 3144 }, { 3146, 3149 }, @@ -3074,6 +3204,7 @@ static const URange16 Mn_range16[] = { { 3393, 3396 }, { 3405, 3405 }, { 3426, 3427 }, + { 3457, 3457 }, { 3530, 3530 }, { 3538, 3540 }, { 3542, 3542 }, @@ -3106,7 +3237,7 @@ static const URange16 Mn_range16[] = { { 4253, 4253 }, { 4957, 4959 }, { 5906, 5908 }, - { 5938, 5940 }, + { 5938, 5939 }, { 5970, 5971 }, { 6002, 6003 }, { 6068, 6069 }, @@ -3115,6 +3246,7 @@ static const URange16 Mn_range16[] = { { 6089, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6159, 6159 }, { 6277, 6278 }, { 6313, 6313 }, { 6432, 6434 }, @@ -3131,6 +3263,7 @@ static const URange16 Mn_range16[] = { { 6771, 6780 }, { 6783, 6783 }, { 6832, 6845 }, + { 6847, 6862 }, { 6912, 6915 }, { 6964, 6964 }, { 6966, 6970 }, @@ -3153,8 +3286,7 @@ static const URange16 Mn_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8400, 8412 }, { 8417, 8417 }, { 8421, 8432 }, @@ -3171,6 +3303,7 @@ static const URange16 Mn_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43045, 43046 }, + { 43052, 43052 }, { 43204, 43205 }, { 43232, 43249 }, { 43263, 43263 }, @@ -3212,12 +3345,17 @@ static const URange32 Mn_range32[] = { { 68159, 68159 }, { 68325, 68326 }, { 68900, 68903 }, + { 69291, 69292 }, { 69446, 69456 }, + { 69506, 69509 }, { 69633, 69633 }, { 69688, 69702 }, + { 69744, 69744 }, + { 69747, 69748 }, { 69759, 69761 }, { 69811, 69814 }, { 69817, 69818 }, + { 69826, 
69826 }, { 69888, 69890 }, { 69927, 69931 }, { 69933, 69940 }, @@ -3225,6 +3363,7 @@ static const URange32 Mn_range32[] = { { 70016, 70017 }, { 70070, 70078 }, { 70089, 70092 }, + { 70095, 70095 }, { 70191, 70193 }, { 70196, 70196 }, { 70198, 70199 }, @@ -3260,6 +3399,9 @@ static const URange32 Mn_range32[] = { { 71463, 71467 }, { 71727, 71735 }, { 71737, 71738 }, + { 71995, 71996 }, + { 71998, 71998 }, + { 72003, 72003 }, { 72148, 72151 }, { 72154, 72155 }, { 72160, 72160 }, @@ -3291,7 +3433,10 @@ static const URange32 Mn_range32[] = { { 92976, 92982 }, { 94031, 94031 }, { 94095, 94098 }, + { 94180, 94180 }, { 113821, 113822 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -3309,6 +3454,7 @@ static const URange32 Mn_range32[] = { { 122915, 122916 }, { 122918, 122922 }, { 123184, 123190 }, + { 123566, 123566 }, { 123628, 123631 }, { 125136, 125142 }, { 125252, 125258 }, @@ -3413,6 +3559,7 @@ static const URange32 N_range32[] = { { 69216, 69246 }, { 69405, 69414 }, { 69457, 69460 }, + { 69573, 69579 }, { 69714, 69743 }, { 69872, 69881 }, { 69942, 69951 }, @@ -3425,12 +3572,14 @@ static const URange32 N_range32[] = { { 71360, 71369 }, { 71472, 71483 }, { 71904, 71922 }, + { 72016, 72025 }, { 72784, 72812 }, { 73040, 73049 }, { 73120, 73129 }, { 73664, 73684 }, { 74752, 74862 }, { 92768, 92777 }, + { 92864, 92873 }, { 93008, 93017 }, { 93019, 93025 }, { 93824, 93846 }, @@ -3447,6 +3596,7 @@ static const URange32 N_range32[] = { { 126209, 126253 }, { 126255, 126269 }, { 127232, 127244 }, + { 130032, 130041 }, }; static const URange16 Nd_range16[] = { { 48, 57 }, @@ -3501,15 +3651,18 @@ static const URange32 Nd_range32[] = { { 71360, 71369 }, { 71472, 71481 }, { 71904, 71913 }, + { 72016, 72025 }, { 72784, 72793 }, { 73040, 73049 }, { 73120, 73129 }, { 92768, 92777 }, + { 92864, 92873 }, { 93008, 93017 }, { 120782, 120831 }, { 123200, 123209 }, { 123632, 123641 }, { 125264, 125273 }, + { 130032, 130041 }, }; 
static const URange16 Nl_range16[] = { { 5870, 5872 }, @@ -3583,6 +3736,7 @@ static const URange32 No_range32[] = { { 69216, 69246 }, { 69405, 69414 }, { 69457, 69460 }, + { 69573, 69579 }, { 69714, 69733 }, { 70113, 70132 }, { 71482, 71483 }, @@ -3629,7 +3783,7 @@ static const URange16 P_range16[] = { { 1545, 1546 }, { 1548, 1549 }, { 1563, 1563 }, - { 1566, 1567 }, + { 1565, 1567 }, { 1642, 1645 }, { 1748, 1748 }, { 1792, 1805 }, @@ -3668,6 +3822,7 @@ static const URange16 P_range16[] = { { 6816, 6822 }, { 6824, 6829 }, { 7002, 7008 }, + { 7037, 7038 }, { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, @@ -3692,6 +3847,7 @@ static const URange16 P_range16[] = { { 11632, 11632 }, { 11776, 11822 }, { 11824, 11855 }, + { 11858, 11869 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3747,7 +3903,9 @@ static const URange32 P_range32[] = { { 68336, 68342 }, { 68409, 68415 }, { 68505, 68508 }, + { 69293, 69293 }, { 69461, 69465 }, + { 69510, 69513 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, @@ -3760,14 +3918,16 @@ static const URange32 P_range32[] = { { 70200, 70205 }, { 70313, 70313 }, { 70731, 70735 }, - { 70747, 70747 }, + { 70746, 70747 }, { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, { 71233, 71235 }, { 71264, 71276 }, + { 71353, 71353 }, { 71484, 71486 }, { 71739, 71739 }, + { 72004, 72006 }, { 72162, 72162 }, { 72255, 72262 }, { 72346, 72348 }, @@ -3777,6 +3937,7 @@ static const URange32 P_range32[] = { { 73463, 73464 }, { 73727, 73727 }, { 74864, 74868 }, + { 77809, 77810 }, { 92782, 92783 }, { 92917, 92917 }, { 92983, 92987 }, @@ -3806,6 +3967,7 @@ static const URange16 Pd_range16[] = { { 11802, 11802 }, { 11834, 11835 }, { 11840, 11840 }, + { 11869, 11869 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, @@ -3814,6 +3976,9 @@ static const URange16 Pd_range16[] = { { 65123, 65123 }, { 65293, 65293 }, }; +static const URange32 Pd_range32[] = { + { 69293, 69293 }, +}; static const URange16 Pe_range16[] = { { 41, 41 }, { 93, 
93 }, @@ -3858,6 +4023,10 @@ static const URange16 Pe_range16[] = { { 11813, 11813 }, { 11815, 11815 }, { 11817, 11817 }, + { 11862, 11862 }, + { 11864, 11864 }, + { 11866, 11866 }, + { 11868, 11868 }, { 12297, 12297 }, { 12299, 12299 }, { 12301, 12301 }, @@ -3937,7 +4106,7 @@ static const URange16 Po_range16[] = { { 1545, 1546 }, { 1548, 1549 }, { 1563, 1563 }, - { 1566, 1567 }, + { 1565, 1567 }, { 1642, 1645 }, { 1748, 1748 }, { 1792, 1805 }, @@ -3974,6 +4143,7 @@ static const URange16 Po_range16[] = { { 6816, 6822 }, { 6824, 6829 }, { 7002, 7008 }, + { 7037, 7038 }, { 7164, 7167 }, { 7227, 7231 }, { 7294, 7295 }, @@ -4002,6 +4172,7 @@ static const URange16 Po_range16[] = { { 11836, 11839 }, { 11841, 11841 }, { 11843, 11855 }, + { 11858, 11860 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -4057,6 +4228,7 @@ static const URange32 Po_range32[] = { { 68409, 68415 }, { 68505, 68508 }, { 69461, 69465 }, + { 69510, 69513 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, @@ -4069,14 +4241,16 @@ static const URange32 Po_range32[] = { { 70200, 70205 }, { 70313, 70313 }, { 70731, 70735 }, - { 70747, 70747 }, + { 70746, 70747 }, { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, { 71233, 71235 }, { 71264, 71276 }, + { 71353, 71353 }, { 71484, 71486 }, { 71739, 71739 }, + { 72004, 72006 }, { 72162, 72162 }, { 72255, 72262 }, { 72346, 72348 }, @@ -4086,6 +4260,7 @@ static const URange32 Po_range32[] = { { 73463, 73464 }, { 73727, 73727 }, { 74864, 74868 }, + { 77809, 77810 }, { 92782, 92783 }, { 92917, 92917 }, { 92983, 92987 }, @@ -4143,6 +4318,10 @@ static const URange16 Ps_range16[] = { { 11814, 11814 }, { 11816, 11816 }, { 11842, 11842 }, + { 11861, 11861 }, + { 11863, 11863 }, + { 11865, 11865 }, + { 11867, 11867 }, { 12296, 12296 }, { 12298, 12298 }, { 12300, 12300 }, @@ -4207,6 +4386,7 @@ static const URange16 S_range16[] = { { 1789, 1790 }, { 2038, 2038 }, { 2046, 2047 }, + { 2184, 2184 }, { 2546, 2547 }, { 2554, 2555 }, { 2801, 2801 }, @@ 
-4245,7 +4425,7 @@ static const URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8383 }, + { 8352, 8384 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -4274,8 +4454,9 @@ static const URange16 S_range16[] = { { 10716, 10747 }, { 10750, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4304,9 +4485,12 @@ static const URange16 S_range16[] = { { 43062, 43065 }, { 43639, 43641 }, { 43867, 43867 }, + { 43882, 43883 }, { 64297, 64297 }, - { 64434, 64449 }, - { 65020, 65021 }, + { 64434, 64450 }, + { 64832, 64847 }, + { 64975, 64975 }, + { 65020, 65023 }, { 65122, 65122 }, { 65124, 65126 }, { 65129, 65129 }, @@ -4325,7 +4509,7 @@ static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65932, 65934 }, - { 65936, 65947 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, @@ -4335,13 +4519,14 @@ static const URange32 S_range32[] = { { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, + { 118608, 118723 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -4372,36 +4557,38 @@ static const URange32 S_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127245, 127405 }, { 127462, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, - { 127744, 128725 }, - { 128736, 128748 }, - { 128752, 128762 }, + { 127744, 128727 }, + { 128733, 128748 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 
}, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, - { 129485, 129619 }, + { 129200, 129201 }, + { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, - { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129648, 129652 }, + { 129656, 129660 }, + { 129664, 129670 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, + { 129792, 129938 }, + { 129940, 129994 }, }; static const URange16 Sc_range16[] = { { 36, 36 }, @@ -4415,7 +4602,7 @@ static const URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8383 }, + { 8352, 8384 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -4442,6 +4629,7 @@ static const URange16 Sk_range16[] = { { 751, 767 }, { 885, 885 }, { 900, 901 }, + { 2184, 2184 }, { 8125, 8125 }, { 8127, 8129 }, { 8141, 8143 }, @@ -4453,7 +4641,8 @@ static const URange16 Sk_range16[] = { { 42784, 42785 }, { 42889, 42890 }, { 43867, 43867 }, - { 64434, 64449 }, + { 43882, 43883 }, + { 64434, 64450 }, { 65342, 65342 }, { 65344, 65344 }, { 65507, 65507 }, @@ -4610,8 +4799,9 @@ static const URange16 So_range16[] = { { 11077, 11078 }, { 11085, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4636,7 +4826,9 @@ static const URange16 So_range16[] = { { 43062, 43063 }, { 43065, 43065 }, { 43639, 43641 }, - { 65021, 65021 }, + { 64832, 64847 }, + { 64975, 64975 }, + { 65021, 65023 }, { 65508, 65508 }, { 65512, 65512 }, { 65517, 65518 }, @@ -4646,7 +4838,7 @@ static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65932, 65934 }, - { 65936, 65947 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, @@ -4657,13 +4849,14 @@ static const URange32 So_range32[] = { { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, + { 118608, 118723 }, { 118784, 119029 
}, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -4681,37 +4874,39 @@ static const URange32 So_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127245, 127405 }, { 127462, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, { 127744, 127994 }, - { 128000, 128725 }, - { 128736, 128748 }, - { 128752, 128762 }, + { 128000, 128727 }, + { 128733, 128748 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 }, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, - { 129485, 129619 }, + { 129200, 129201 }, + { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, - { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129648, 129652 }, + { 129656, 129660 }, + { 129664, 129670 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, + { 129792, 129938 }, + { 129940, 129994 }, }; static const URange16 Z_range16[] = { { 32, 32 }, @@ -4746,7 +4941,7 @@ static const URange32 Adlam_range32[] = { static const URange32 Ahom_range32[] = { { 71424, 71450 }, { 71453, 71467 }, - { 71472, 71487 }, + { 71472, 71494 }, }; static const URange32 Anatolian_Hieroglyphs_range32[] = { { 82944, 83526 }, @@ -4755,23 +4950,23 @@ static const URange16 Arabic_range16[] = { { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, - { 1564, 1564 }, - { 1566, 1566 }, + { 1564, 1566 }, { 1568, 1599 }, { 1601, 1610 }, { 1622, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, - { 2208, 2228 }, - { 2230, 2237 }, - { 
2259, 2273 }, + { 2160, 2190 }, + { 2192, 2193 }, + { 2200, 2273 }, { 2275, 2303 }, - { 64336, 64449 }, + { 64336, 64450 }, { 64467, 64829 }, - { 64848, 64911 }, + { 64832, 64911 }, { 64914, 64967 }, - { 65008, 65021 }, + { 64975, 64975 }, + { 65008, 65023 }, { 65136, 65140 }, { 65142, 65276 }, }; @@ -4814,8 +5009,7 @@ static const URange32 Arabic_range32[] = { }; static const URange16 Armenian_range16[] = { { 1329, 1366 }, - { 1369, 1416 }, - { 1418, 1418 }, + { 1369, 1418 }, { 1421, 1423 }, { 64275, 64279 }, }; @@ -4824,8 +5018,8 @@ static const URange32 Avestan_range32[] = { { 68409, 68415 }, }; static const URange16 Balinese_range16[] = { - { 6912, 6987 }, - { 6992, 7036 }, + { 6912, 6988 }, + { 6992, 7038 }, }; static const URange16 Bamum_range16[] = { { 42656, 42743 }, @@ -4866,11 +5060,11 @@ static const URange32 Bhaiksuki_range32[] = { static const URange16 Bopomofo_range16[] = { { 746, 747 }, { 12549, 12591 }, - { 12704, 12730 }, + { 12704, 12735 }, }; static const URange32 Brahmi_range32[] = { { 69632, 69709 }, - { 69714, 69743 }, + { 69714, 69749 }, { 69759, 69759 }, }; static const URange16 Braille_range16[] = { @@ -4887,6 +5081,9 @@ static const URange16 Canadian_Aboriginal_range16[] = { { 5120, 5759 }, { 6320, 6389 }, }; +static const URange32 Canadian_Aboriginal_range32[] = { + { 72368, 72383 }, +}; static const URange32 Carian_range32[] = { { 66208, 66256 }, }; @@ -4896,7 +5093,7 @@ static const URange32 Caucasian_Albanian_range32[] = { }; static const URange32 Chakma_range32[] = { { 69888, 69940 }, - { 69942, 69958 }, + { 69942, 69959 }, }; static const URange16 Cham_range16[] = { { 43520, 43574 }, @@ -4909,6 +5106,9 @@ static const URange16 Cherokee_range16[] = { { 5112, 5117 }, { 43888, 43967 }, }; +static const URange32 Chorasmian_range32[] = { + { 69552, 69579 }, +}; static const URange16 Common_range16[] = { { 0, 64 }, { 91, 96 }, @@ -4924,7 +5124,6 @@ static const URange16 Common_range16[] = { { 894, 894 }, { 901, 901 }, { 903, 903 }, - { 
1417, 1417 }, { 1541, 1541 }, { 1548, 1548 }, { 1563, 1563 }, @@ -4951,7 +5150,7 @@ static const URange16 Common_range16[] = { { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8383 }, + { 8352, 8384 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, @@ -4963,8 +5162,8 @@ static const URange16 Common_range16[] = { { 9312, 10239 }, { 10496, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, - { 11776, 11855 }, + { 11159, 11263 }, + { 11776, 11869 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -4987,6 +5186,7 @@ static const URange16 Common_range16[] = { { 43310, 43310 }, { 43471, 43471 }, { 43867, 43867 }, + { 43882, 43883 }, { 64830, 64831 }, { 65040, 65049 }, { 65072, 65106 }, @@ -5006,18 +5206,18 @@ static const URange32 Common_range32[] = { { 65792, 65794 }, { 65799, 65843 }, { 65847, 65855 }, - { 65936, 65947 }, + { 65936, 65948 }, { 66000, 66044 }, { 66273, 66299 }, - { 94178, 94179 }, { 113824, 113827 }, + { 118608, 118723 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119142 }, { 119146, 119162 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119272 }, + { 119214, 119274 }, { 119520, 119539 }, { 119552, 119638 }, { 119648, 119672 }, @@ -5050,38 +5250,40 @@ static const URange32 Common_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127232, 127244 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127232, 127405 }, { 127462, 127487 }, { 127489, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, - { 127744, 128725 }, - { 128736, 128748 }, - { 128752, 128762 }, + { 127744, 128727 }, + { 128733, 128748 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, + { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 }, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, - { 129485, 129619 }, + { 
129200, 129201 }, + { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, - { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129648, 129652 }, + { 129656, 129660 }, + { 129664, 129670 }, + { 129680, 129708 }, + { 129712, 129722 }, + { 129728, 129733 }, + { 129744, 129753 }, + { 129760, 129767 }, + { 129776, 129782 }, + { 129792, 129938 }, + { 129940, 129994 }, + { 130032, 130041 }, { 917505, 917505 }, { 917536, 917631 }, }; @@ -5104,6 +5306,9 @@ static const URange32 Cypriot_range32[] = { { 67644, 67644 }, { 67647, 67647 }, }; +static const URange32 Cypro_Minoan_range32[] = { + { 77712, 77810 }, +}; static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1327 }, @@ -5123,6 +5328,16 @@ static const URange16 Devanagari_range16[] = { { 2406, 2431 }, { 43232, 43263 }, }; +static const URange32 Dives_Akuru_range32[] = { + { 71936, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71989 }, + { 71991, 71992 }, + { 71995, 72006 }, + { 72016, 72025 }, +}; static const URange32 Dogra_range32[] = { { 71680, 71739 }, }; @@ -5177,6 +5392,12 @@ static const URange16 Ethiopic_range16[] = { { 43808, 43814 }, { 43816, 43822 }, }; +static const URange32 Ethiopic_range32[] = { + { 124896, 124902 }, + { 124904, 124907 }, + { 124909, 124910 }, + { 124912, 124926 }, +}; static const URange16 Georgian_range16[] = { { 4256, 4293 }, { 4295, 4295 }, @@ -5190,8 +5411,7 @@ static const URange16 Georgian_range16[] = { { 11565, 11565 }, }; static const URange16 Glagolitic_range16[] = { - { 11264, 11310 }, - { 11312, 11358 }, + { 11264, 11359 }, }; static const URange32 Glagolitic_range32[] = { { 122880, 122886 }, @@ -5310,18 +5530,21 @@ static const URange16 Han_range16[] = { { 12295, 12295 }, { 12321, 12329 }, { 12344, 12347 }, - { 13312, 19893 }, - { 19968, 40943 }, + { 13312, 19903 }, + { 19968, 40959 }, { 63744, 64109 }, { 64112, 64217 }, }; static const URange32 Han_range32[] = { - { 131072, 173782 }, - { 173824, 177972 
}, + { 94178, 94179 }, + { 94192, 94193 }, + { 131072, 173791 }, + { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -5367,7 +5590,7 @@ static const URange16 Hiragana_range16[] = { { 12445, 12447 }, }; static const URange32 Hiragana_range32[] = { - { 110593, 110878 }, + { 110593, 110879 }, { 110928, 110930 }, { 127488, 127488 }, }; @@ -5381,15 +5604,14 @@ static const URange16 Inherited_range16[] = { { 1611, 1621 }, { 1648, 1648 }, { 2385, 2388 }, - { 6832, 6846 }, + { 6832, 6862 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7673 }, - { 7675, 7679 }, + { 7616, 7679 }, { 8204, 8205 }, { 8400, 8432 }, { 12330, 12333 }, @@ -5401,6 +5623,8 @@ static const URange32 Inherited_range32[] = { { 66045, 66045 }, { 66272, 66272 }, { 70459, 70459 }, + { 118528, 118573 }, + { 118576, 118598 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -5421,7 +5645,7 @@ static const URange16 Javanese_range16[] = { { 43486, 43487 }, }; static const URange32 Kaithi_range32[] = { - { 69760, 69825 }, + { 69760, 69826 }, { 69837, 69837 }, }; static const URange16 Kannada_range16[] = { @@ -5434,7 +5658,7 @@ static const URange16 Kannada_range16[] = { { 3270, 3272 }, { 3274, 3277 }, { 3285, 3286 }, - { 3294, 3294 }, + { 3293, 3294 }, { 3296, 3299 }, { 3302, 3311 }, { 3313, 3314 }, @@ -5449,7 +5673,11 @@ static const URange16 Katakana_range16[] = { { 65393, 65437 }, }; static const URange32 Katakana_range32[] = { + { 110576, 110579 }, + { 110581, 110587 }, + { 110589, 110590 }, { 110592, 110592 }, + { 110880, 110882 }, { 110948, 110951 }, }; static const URange16 Kayah_Li_range16[] = { @@ -5466,6 +5694,10 @@ static const URange32 Kharoshthi_range32[] = { { 68159, 68168 }, { 68176, 68184 }, }; +static const URange32 Khitan_Small_Script_range32[] = { + { 94180, 94180 }, + { 101120, 101589 
}, +}; static const URange16 Khmer_range16[] = { { 6016, 6109 }, { 6112, 6121 }, @@ -5517,16 +5749,24 @@ static const URange16 Latin_range16[] = { { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, - { 42891, 42943 }, - { 42946, 42950 }, - { 42999, 43007 }, + { 42891, 42954 }, + { 42960, 42961 }, + { 42963, 42963 }, + { 42965, 42969 }, + { 42994, 43007 }, { 43824, 43866 }, { 43868, 43876 }, - { 43878, 43879 }, + { 43878, 43881 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, }; +static const URange32 Latin_range32[] = { + { 67456, 67461 }, + { 67463, 67504 }, + { 67506, 67514 }, + { 122624, 122654 }, +}; static const URange16 Lepcha_range16[] = { { 7168, 7223 }, { 7227, 7241 }, @@ -5556,6 +5796,9 @@ static const URange32 Linear_B_range32[] = { static const URange16 Lisu_range16[] = { { 42192, 42239 }, }; +static const URange32 Lisu_range32[] = { + { 73648, 73648 }, +}; static const URange32 Lycian_range32[] = { { 66176, 66204 }, }; @@ -5570,8 +5813,7 @@ static const URange32 Makasar_range32[] = { { 73440, 73464 }, }; static const URange16 Malayalam_range16[] = { - { 3328, 3331 }, - { 3333, 3340 }, + { 3328, 3340 }, { 3342, 3344 }, { 3346, 3396 }, { 3398, 3400 }, @@ -5633,8 +5875,7 @@ static const URange32 Modi_range32[] = { static const URange16 Mongolian_range16[] = { { 6144, 6145 }, { 6148, 6148 }, - { 6150, 6158 }, - { 6160, 6169 }, + { 6150, 6169 }, { 6176, 6264 }, { 6272, 6314 }, }; @@ -5674,9 +5915,8 @@ static const URange16 New_Tai_Lue_range16[] = { { 6622, 6623 }, }; static const URange32 Newa_range32[] = { - { 70656, 70745 }, - { 70747, 70747 }, - { 70749, 70751 }, + { 70656, 70747 }, + { 70749, 70753 }, }; static const URange16 Nko_range16[] = { { 1984, 2042 }, @@ -5726,6 +5966,9 @@ static const URange32 Old_South_Arabian_range32[] = { static const URange32 Old_Turkic_range32[] = { { 68608, 68680 }, }; +static const URange32 Old_Uyghur_range32[] = { + { 69488, 69513 }, +}; static const URange16 Oriya_range16[] = { { 2817, 2819 }, { 2821, 2828 }, 
@@ -5737,7 +5980,7 @@ static const URange16 Oriya_range16[] = { { 2876, 2884 }, { 2887, 2888 }, { 2891, 2893 }, - { 2902, 2903 }, + { 2901, 2903 }, { 2908, 2909 }, { 2911, 2915 }, { 2918, 2935 }, @@ -5792,8 +6035,7 @@ static const URange16 Saurashtra_range16[] = { { 43214, 43225 }, }; static const URange32 Sharada_range32[] = { - { 70016, 70093 }, - { 70096, 70111 }, + { 70016, 70111 }, }; static const URange32 Shavian_range32[] = { { 66640, 66687 }, @@ -5808,7 +6050,7 @@ static const URange32 SignWriting_range32[] = { { 121505, 121519 }, }; static const URange16 Sinhala_range16[] = { - { 3458, 3459 }, + { 3457, 3459 }, { 3461, 3478 }, { 3482, 3505 }, { 3507, 3515 }, @@ -5839,7 +6081,7 @@ static const URange16 Sundanese_range16[] = { { 7360, 7367 }, }; static const URange16 Syloti_Nagri_range16[] = { - { 43008, 43051 }, + { 43008, 43052 }, }; static const URange16 Syriac_range16[] = { { 1792, 1805 }, @@ -5848,8 +6090,8 @@ static const URange16 Syriac_range16[] = { { 2144, 2154 }, }; static const URange16 Tagalog_range16[] = { - { 5888, 5900 }, - { 5902, 5908 }, + { 5888, 5909 }, + { 5919, 5919 }, }; static const URange16 Tagbanwa_range16[] = { { 5984, 5996 }, @@ -5872,7 +6114,7 @@ static const URange16 Tai_Viet_range16[] = { { 43739, 43743 }, }; static const URange32 Takri_range32[] = { - { 71296, 71352 }, + { 71296, 71353 }, { 71360, 71369 }, }; static const URange16 Tamil_range16[] = { @@ -5897,21 +6139,27 @@ static const URange32 Tamil_range32[] = { { 73664, 73713 }, { 73727, 73727 }, }; +static const URange32 Tangsa_range32[] = { + { 92784, 92862 }, + { 92864, 92873 }, +}; static const URange32 Tangut_range32[] = { { 94176, 94176 }, { 94208, 100343 }, - { 100352, 101106 }, + { 100352, 101119 }, + { 101632, 101640 }, }; static const URange16 Telugu_range16[] = { { 3072, 3084 }, { 3086, 3088 }, { 3090, 3112 }, { 3114, 3129 }, - { 3133, 3140 }, + { 3132, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3160, 3162 }, + { 3165, 3165 }, { 3168, 3171 }, { 
3174, 3183 }, { 3191, 3199 }, @@ -5941,6 +6189,9 @@ static const URange32 Tirhuta_range32[] = { { 70784, 70855 }, { 70864, 70873 }, }; +static const URange32 Toto_range32[] = { + { 123536, 123566 }, +}; static const URange32 Ugaritic_range32[] = { { 66432, 66461 }, { 66463, 66463 }, @@ -5948,6 +6199,16 @@ static const URange32 Ugaritic_range32[] = { static const URange16 Vai_range16[] = { { 42240, 42539 }, }; +static const URange32 Vithkuqi_range32[] = { + { 66928, 66938 }, + { 66940, 66954 }, + { 66956, 66962 }, + { 66964, 66965 }, + { 66967, 66977 }, + { 66979, 66993 }, + { 66995, 67001 }, + { 67003, 67004 }, +}; static const URange32 Wancho_range32[] = { { 123584, 123641 }, { 123647, 123647 }, @@ -5956,6 +6217,11 @@ static const URange32 Warang_Citi_range32[] = { { 71840, 71922 }, { 71935, 71935 }, }; +static const URange32 Yezidi_range32[] = { + { 69248, 69289 }, + { 69291, 69293 }, + { 69296, 69297 }, +}; static const URange16 Yi_range16[] = { { 40960, 42124 }, { 42128, 42182 }, @@ -5963,13 +6229,13 @@ static const URange16 Yi_range16[] = { static const URange32 Zanabazar_Square_range32[] = { { 72192, 72263 }, }; -// 3987 16-bit ranges, 1525 32-bit ranges +// 4038 16-bit ranges, 1712 32-bit ranges const UGroup unicode_groups[] = { { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, - { "Armenian", +1, Armenian_range16, 5, 0, 0 }, + { "Armenian", +1, Armenian_range16, 4, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, @@ -5982,39 +6248,42 @@ const UGroup unicode_groups[] = { { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 16, C_range32, 9 }, - { "Canadian_Aboriginal", +1, 
Canadian_Aboriginal_range16, 2, 0, 0 }, + { "C", +1, C_range16, 17, C_range32, 9 }, + { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, Canadian_Aboriginal_range32, 1 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 13, Cf_range32, 7 }, + { "Cf", +1, Cf_range16, 14, Cf_range32, 7 }, { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, + { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 91, Common_range32, 81 }, + { "Common", +1, Common_range16, 91, Common_range32, 83 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, + { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 }, { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, { "Dogra", +1, 0, 0, Dogra_range32, 1 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, - { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, + { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 }, { "Georgian", +1, Georgian_range16, 10, 0, 0 }, - { "Glagolitic", +1, Glagolitic_range16, 2, Glagolitic_range32, 5 }, + { "Glagolitic", +1, Glagolitic_range16, 1, Glagolitic_range32, 5 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, { "Grantha", +1, 0, 0, Grantha_range32, 15 }, { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, { 
"Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 6 }, + { "Han", +1, Han_range16, 11, Han_range32, 9 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, @@ -6022,42 +6291,43 @@ const UGroup unicode_groups[] = { { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, - { "Inherited", +1, Inherited_range16, 20, Inherited_range32, 8 }, + { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, { "Kannada", +1, Kannada_range16, 13, 0, 0 }, - { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 2 }, + { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, + { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 380, L_range32, 229 }, + { "L", +1, L_range16, 380, L_range32, 268 }, { "Lao", +1, Lao_range16, 11, 0, 0 }, - { "Latin", +1, Latin_range16, 32, 0, 0 }, + { "Latin", +1, Latin_range16, 34, Latin_range32, 4 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, - { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 608, Ll_range32, 34 }, - { "Lm", +1, Lm_range16, 54, Lm_range32, 6 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 186 }, + { "Lisu", +1, Lisu_range16, 
1, Lisu_range32, 1 }, + { "Ll", +1, Ll_range16, 617, Ll_range32, 40 }, + { "Lm", +1, Lm_range16, 57, Lm_range32, 12 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 211 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 599, Lu_range32, 37 }, + { "Lu", +1, Lu_range16, 605, Lu_range32, 41 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 186, M_range32, 94 }, + { "M", +1, M_range16, 189, M_range32, 110 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Makasar", +1, 0, 0, Makasar_range32, 1 }, - { "Malayalam", +1, Malayalam_range16, 8, 0, 0 }, + { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, { "Marchen", +1, 0, 0, Marchen_range32, 3 }, { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, - { "Mc", +1, Mc_range16, 109, Mc_range32, 59 }, + { "Mc", +1, Mc_range16, 111, Mc_range32, 66 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, @@ -6065,21 +6335,21 @@ const UGroup unicode_groups[] = { { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 207, Mn_range32, 111 }, + { "Mn", +1, Mn_range16, 212, Mn_range32, 124 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, - { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 }, + { "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 67, N_range32, 63 }, + { "N", +1, N_range16, 67, N_range32, 67 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 22 }, + { "Nd", +1, 
Nd_range16, 37, Nd_range32, 25 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, - { "Newa", +1, 0, 0, Newa_range32, 3 }, + { "Newa", +1, 0, 0, Newa_range32, 2 }, { "Nko", +1, Nko_range16, 2, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 29, No_range32, 41 }, + { "No", +1, No_range16, 29, No_range32, 42 }, { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, @@ -6092,37 +6362,38 @@ const UGroup unicode_groups[] = { { "Old_Sogdian", +1, 0, 0, Old_Sogdian_range32, 1 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, + { "Old_Uyghur", +1, 0, 0, Old_Uyghur_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 131, P_range32, 51 }, + { "P", +1, P_range16, 133, P_range32, 56 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 17, 0, 0 }, - { "Pe", +1, Pe_range16, 72, 0, 0 }, + { "Pd", +1, Pd_range16, 18, Pd_range32, 1 }, + { "Pe", +1, Pe_range16, 76, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 128, Po_range32, 51 }, - { "Ps", +1, Ps_range16, 75, 0, 0 }, + { "Po", +1, Po_range16, 130, Po_range32, 55 }, + { "Ps", +1, Ps_range16, 79, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 146, S_range32, 80 }, + { "S", +1, S_range16, 151, S_range32, 83 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { 
"Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, - { "Sharada", +1, 0, 0, Sharada_range32, 2 }, + { "Sharada", +1, 0, 0, Sharada_range32, 1 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, { "Siddham", +1, 0, 0, Siddham_range32, 2 }, { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, - { "Sk", +1, Sk_range16, 28, Sk_range32, 1 }, + { "Sk", +1, Sk_range16, 30, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 111, So_range32, 69 }, + { "So", +1, So_range16, 114, So_range32, 72 }, { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, @@ -6136,17 +6407,21 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 }, - { "Tangut", +1, 0, 0, Tangut_range32, 3 }, - { "Telugu", +1, Telugu_range16, 12, 0, 0 }, + { "Tangsa", +1, 0, 0, Tangsa_range32, 2 }, + { "Tangut", +1, 0, 0, Tangut_range32, 4 }, + { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, { "Tibetan", +1, Tibetan_range16, 7, 0, 0 }, { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 }, { "Tirhuta", +1, 0, 0, Tirhuta_range32, 2 }, + { "Toto", +1, 0, 0, Toto_range32, 1 }, { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, + { "Vithkuqi", +1, 0, 0, Vithkuqi_range32, 8 }, { "Wancho", +1, 0, 0, Wancho_range32, 2 }, { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, + { "Yezidi", +1, 0, 0, Yezidi_range32, 3 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, @@ -6154,7 +6429,7 @@ const UGroup unicode_groups[] = { { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; 
-const int num_unicode_groups = 188; +const int num_unicode_groups = 197; } // namespace re2 diff --git a/re2/walker-inl.h b/re2/walker-inl.h index 310be54bd7ebbfe6d3d894a71110c5c5527225fb..4d064a0970f495a6333bdde404c3261777c536dd 100644 --- a/re2/walker-inl.h +++ b/re2/walker-inl.h @@ -89,7 +89,7 @@ template class Regexp::Walker { private: // Walk state for the entire traversal. - std::stack >* stack_; + std::stack> stack_; bool stopped_early_; int max_visits_; @@ -119,7 +119,7 @@ template T Regexp::Walker::Copy(T arg) { // State about a single level in the traversal. template struct WalkState { - WalkState(Regexp* re, T parent) + WalkState(Regexp* re, T parent) : re(re), n(-1), parent_arg(parent), @@ -134,24 +134,23 @@ template struct WalkState { }; template Regexp::Walker::Walker() { - stack_ = new std::stack >; stopped_early_ = false; } template Regexp::Walker::~Walker() { Reset(); - delete stack_; } // Clears the stack. Should never be necessary, since // Walk always enters and exits with an empty stack. // Logs DFATAL if stack is not already clear. 
template void Regexp::Walker::Reset() { - if (stack_ && stack_->size() > 0) { + if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; - while (stack_->size() > 0) { - delete[] stack_->top().child_args; - stack_->pop(); + while (!stack_.empty()) { + if (stack_.top().re->nsub_ > 1) + delete[] stack_.top().child_args; + stack_.pop(); } } } @@ -165,13 +164,13 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, return top_arg; } - stack_->push(WalkState(re, top_arg)); + stack_.push(WalkState(re, top_arg)); WalkState* s; for (;;) { T t; - s = &stack_->top(); - Regexp* re = s->re; + s = &stack_.top(); + re = s->re; switch (s->n) { case -1: { if (--max_visits_ < 0) { @@ -201,7 +200,7 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, s->child_args[s->n] = Copy(s->child_args[s->n - 1]); s->n++; } else { - stack_->push(WalkState(sub[s->n], s->pre_arg)); + stack_.push(WalkState(sub[s->n], s->pre_arg)); } continue; } @@ -214,12 +213,12 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, } } - // We've finished stack_->top(). + // We've finished stack_.top(). // Update next guy down. - stack_->pop(); - if (stack_->size() == 0) + stack_.pop(); + if (stack_.empty()) return t; - s = &stack_->top(); + s = &stack_.top(); if (s->child_args != NULL) s->child_args[s->n] = t; else diff --git a/testinstall.cc b/testinstall.cc index 47db4e68cc560568fc2af638c8b703ace3342e5d..19cc9003bf8decc18a7be4dacfb0bb4f181a39b0 100644 --- a/testinstall.cc +++ b/testinstall.cc @@ -2,23 +2,26 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#include -#include #include +#include +#include + +int main() { + re2::FilteredRE2 f; + int id; + f.Add("a.*b.*c", RE2::DefaultOptions, &id); + std::vector v; + f.Compile(&v); + std::vector ids; + f.FirstMatch("abbccc", ids); -int main(void) { - re2::FilteredRE2 f; - int id; - f.Add("a.*b.*c", RE2::DefaultOptions, &id); - std::vector v; - f.Compile(&v); - std::vector ids; - f.FirstMatch("abbccc", ids); + int n; + if (RE2::FullMatch("axbyc", "a.*b.*c") && + RE2::PartialMatch("foo123bar", "(\\d+)", &n) && n == 123) { + printf("PASS\n"); + return 0; + } - if(RE2::FullMatch("axbyc", "a.*b.*c")) { - printf("PASS\n"); - return 0; - } - printf("FAIL\n"); - return 2; + printf("FAIL\n"); + return 2; } diff --git a/util/mutex.h b/util/mutex.h index 9c491580481ce133bff76559e3dca41b4c8f8dfa..158046bb5c9f40b7bbcbebe2c5ebf8dae23979d2 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -10,7 +10,13 @@ * You should assume the locks are *not* re-entrant. */ -#if !defined(_WIN32) +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. 
+#include +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif @@ -20,7 +26,9 @@ #endif #endif -#if defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #include #include typedef pthread_rwlock_t MutexType; @@ -56,7 +64,16 @@ class Mutex { Mutex& operator=(const Mutex&) = delete; }; -#if defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) + +Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #define SAFE_PTHREAD(fncall) \ do { \ diff --git a/util/pcre.cc b/util/pcre.cc index b3a32ebff6151d410eb32a0753c2931c696bb5ae..b68985144ff6439182e849c485636b9fe697732b 100644 --- a/util/pcre.cc +++ b/util/pcre.cc @@ -22,9 +22,7 @@ #include "util/strutil.h" // Silence warnings about the wacky formatting in the operator() functions. -// Note that we test for Clang first because it defines __GNUC__ as well. -#if defined(__clang__) -#elif defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif diff --git a/util/pcre.h b/util/pcre.h index 644dce68c24287024989a82411510468b876d590..896b0bdf8935a8e901fc74a2be0fdf0786e0bbc4 100644 --- a/util/pcre.h +++ b/util/pcre.h @@ -555,7 +555,7 @@ class PCRE_Options { // Hex/Octal/Binary? 
// Special class for parsing into objects that define a ParseFrom() method -template +template class _PCRE_MatchObject { public: static inline bool Parse(const char* str, size_t n, void* dest) { @@ -600,9 +600,9 @@ class PCRE::Arg { #undef MAKE_PARSER // Generic constructor - template Arg(T*, Parser parser); + template Arg(T*, Parser parser); // Generic constructor template - template Arg(T* p) + template Arg(T* p) : arg_(p), parser_(_PCRE_MatchObject::Parse) { }