497 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Groff
		
	
	
	
	
	
			
		
		
	
	
			497 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Groff
		
	
	
	
	
	
| .TH PCRESYNTAX 3 "11 November 2012" "PCRE 8.32"
 | |
| .SH NAME
 | |
| PCRE - Perl-compatible regular expressions
 | |
| .SH "PCRE REGULAR EXPRESSION SYNTAX SUMMARY"
 | |
| .rs
 | |
| .sp
 | |
| The full syntax and semantics of the regular expressions that are supported by
 | |
| PCRE are described in the
 | |
| .\" HREF
 | |
| \fBpcrepattern\fP
 | |
| .\"
 | |
| documentation. This document contains a quick-reference summary of the syntax.
 | |
| .
 | |
| .
 | |
| .SH "QUOTING"
 | |
| .rs
 | |
| .sp
 | |
|   \ex         a literal x, where x is any non-alphanumeric character
 | |
|   \eQ...\eE    treat enclosed characters as literal
 | |
| .
 | |
| .
 | |
| .SH "CHARACTERS"
 | |
| .rs
 | |
| .sp
 | |
|   \ea         alarm, that is, the BEL character (hex 07)
 | |
|   \ecx        "control-x", where x is any ASCII character
 | |
|   \ee         escape (hex 1B)
 | |
|   \ef         form feed (hex 0C)
 | |
|   \en         newline (hex 0A)
 | |
|   \er         carriage return (hex 0D)
 | |
|   \et         tab (hex 09)
 | |
|   \eddd       character with octal code ddd, or backreference
 | |
|   \exhh       character with hex code hh
 | |
|   \ex{hhh..}  character with hex code hhh..
 | |
| .
 | |
| .
 | |
| .SH "CHARACTER TYPES"
 | |
| .rs
 | |
| .sp
 | |
|   .          any character except newline;
 | |
|                in dotall mode, any character whatsoever
 | |
|   \eC         one data unit, even in UTF mode (best avoided)
 | |
|   \ed         a decimal digit
 | |
|   \eD         a character that is not a decimal digit
 | |
|   \eh         a horizontal white space character
 | |
|   \eH         a character that is not a horizontal white space character
 | |
|   \eN         a character that is not a newline
 | |
|   \ep{\fIxx\fP}     a character with the \fIxx\fP property
 | |
|   \eP{\fIxx\fP}     a character without the \fIxx\fP property
 | |
|   \eR         a newline sequence
 | |
|   \es         a white space character
 | |
|   \eS         a character that is not a white space character
 | |
|   \ev         a vertical white space character
 | |
|   \eV         a character that is not a vertical white space character
 | |
|   \ew         a "word" character
 | |
|   \eW         a "non-word" character
 | |
|   \eX         a Unicode extended grapheme cluster
 | |
| .sp
 | |
| In PCRE, by default, \ed, \eD, \es, \eS, \ew, and \eW recognize only ASCII
 | |
| characters, even in a UTF mode. However, this can be changed by setting the
 | |
| PCRE_UCP option.
 | |
| .
 | |
| .
 | |
| .SH "GENERAL CATEGORY PROPERTIES FOR \ep AND \eP"
 | |
| .rs
 | |
| .sp
 | |
|   C          Other
 | |
|   Cc         Control
 | |
|   Cf         Format
 | |
|   Cn         Unassigned
 | |
|   Co         Private use
 | |
|   Cs         Surrogate
 | |
| .sp
 | |
|   L          Letter
 | |
|   Ll         Lower case letter
 | |
|   Lm         Modifier letter
 | |
|   Lo         Other letter
 | |
|   Lt         Title case letter
 | |
|   Lu         Upper case letter
 | |
|   L&         Ll, Lu, or Lt
 | |
| .sp
 | |
|   M          Mark
 | |
|   Mc         Spacing mark
 | |
|   Me         Enclosing mark
 | |
|   Mn         Non-spacing mark
 | |
| .sp
 | |
|   N          Number
 | |
|   Nd         Decimal number
 | |
|   Nl         Letter number
 | |
|   No         Other number
 | |
| .sp
 | |
|   P          Punctuation
 | |
|   Pc         Connector punctuation
 | |
|   Pd         Dash punctuation
 | |
|   Pe         Close punctuation
 | |
|   Pf         Final punctuation
 | |
|   Pi         Initial punctuation
 | |
|   Po         Other punctuation
 | |
|   Ps         Open punctuation
 | |
| .sp
 | |
|   S          Symbol
 | |
|   Sc         Currency symbol
 | |
|   Sk         Modifier symbol
 | |
|   Sm         Mathematical symbol
 | |
|   So         Other symbol
 | |
| .sp
 | |
|   Z          Separator
 | |
|   Zl         Line separator
 | |
|   Zp         Paragraph separator
 | |
|   Zs         Space separator
 | |
| .
 | |
| .
 | |
| .SH "PCRE SPECIAL CATEGORY PROPERTIES FOR \ep AND \eP"
 | |
| .rs
 | |
| .sp
 | |
|   Xan        Alphanumeric: union of properties L and N
 | |
|   Xps        POSIX space: property Z or tab, NL, VT, FF, CR
 | |
|   Xsp        Perl space: property Z or tab, NL, FF, CR
 | |
|   Xwd        Perl word: property Xan or underscore
 | |
| .
 | |
| .
 | |
| .SH "SCRIPT NAMES FOR \ep AND \eP"
 | |
| .rs
 | |
| .sp
 | |
| Arabic,
 | |
| Armenian,
 | |
| Avestan,
 | |
| Balinese,
 | |
| Bamum,
 | |
| Batak,
 | |
| Bengali,
 | |
| Bopomofo,
 | |
| Brahmi,
 | |
| Braille,
 | |
| Buginese,
 | |
| Buhid,
 | |
| Canadian_Aboriginal,
 | |
| Carian,
 | |
| Chakma,
 | |
| Cham,
 | |
| Cherokee,
 | |
| Common,
 | |
| Coptic,
 | |
| Cuneiform,
 | |
| Cypriot,
 | |
| Cyrillic,
 | |
| Deseret,
 | |
| Devanagari,
 | |
| Egyptian_Hieroglyphs,
 | |
| Ethiopic,
 | |
| Georgian,
 | |
| Glagolitic,
 | |
| Gothic,
 | |
| Greek,
 | |
| Gujarati,
 | |
| Gurmukhi,
 | |
| Han,
 | |
| Hangul,
 | |
| Hanunoo,
 | |
| Hebrew,
 | |
| Hiragana,
 | |
| Imperial_Aramaic,
 | |
| Inherited,
 | |
| Inscriptional_Pahlavi,
 | |
| Inscriptional_Parthian,
 | |
| Javanese,
 | |
| Kaithi,
 | |
| Kannada,
 | |
| Katakana,
 | |
| Kayah_Li,
 | |
| Kharoshthi,
 | |
| Khmer,
 | |
| Lao,
 | |
| Latin,
 | |
| Lepcha,
 | |
| Limbu,
 | |
| Linear_B,
 | |
| Lisu,
 | |
| Lycian,
 | |
| Lydian,
 | |
| Malayalam,
 | |
| Mandaic,
 | |
| Meetei_Mayek,
 | |
| Meroitic_Cursive,
 | |
| Meroitic_Hieroglyphs,
 | |
| Miao,
 | |
| Mongolian,
 | |
| Myanmar,
 | |
| New_Tai_Lue,
 | |
| Nko,
 | |
| Ogham,
 | |
| Old_Italic,
 | |
| Old_Persian,
 | |
| Old_South_Arabian,
 | |
| Old_Turkic,
 | |
| Ol_Chiki,
 | |
| Oriya,
 | |
| Osmanya,
 | |
| Phags_Pa,
 | |
| Phoenician,
 | |
| Rejang,
 | |
| Runic,
 | |
| Samaritan,
 | |
| Saurashtra,
 | |
| Sharada,
 | |
| Shavian,
 | |
| Sinhala,
 | |
| Sora_Sompeng,
 | |
| Sundanese,
 | |
| Syloti_Nagri,
 | |
| Syriac,
 | |
| Tagalog,
 | |
| Tagbanwa,
 | |
| Tai_Le,
 | |
| Tai_Tham,
 | |
| Tai_Viet,
 | |
| Takri,
 | |
| Tamil,
 | |
| Telugu,
 | |
| Thaana,
 | |
| Thai,
 | |
| Tibetan,
 | |
| Tifinagh,
 | |
| Ugaritic,
 | |
| Vai,
 | |
| Yi.
 | |
| .
 | |
| .
 | |
| .SH "CHARACTER CLASSES"
 | |
| .rs
 | |
| .sp
 | |
|   [...]       positive character class
 | |
|   [^...]      negative character class
 | |
|   [x-y]       range (can be used for hex characters)
 | |
|   [[:xxx:]]   positive POSIX named set
 | |
|   [[:^xxx:]]  negative POSIX named set
 | |
| .sp
 | |
|   alnum       alphanumeric
 | |
|   alpha       alphabetic
 | |
|   ascii       0-127
 | |
|   blank       space or tab
 | |
|   cntrl       control character
 | |
|   digit       decimal digit
 | |
|   graph       printing, excluding space
 | |
|   lower       lower case letter
 | |
|   print       printing, including space
 | |
|   punct       printing, excluding alphanumeric
 | |
|   space       white space
 | |
|   upper       upper case letter
 | |
|   word        same as \ew
 | |
|   xdigit      hexadecimal digit
 | |
| .sp
 | |
| In PCRE, POSIX character set names recognize only ASCII characters by default,
 | |
| but some of them use Unicode properties if PCRE_UCP is set. You can use
 | |
| \eQ...\eE inside a character class.
 | |
| .
 | |
| .
 | |
| .SH "QUANTIFIERS"
 | |
| .rs
 | |
| .sp
 | |
|   ?           0 or 1, greedy
 | |
|   ?+          0 or 1, possessive
 | |
|   ??          0 or 1, lazy
 | |
|   *           0 or more, greedy
 | |
|   *+          0 or more, possessive
 | |
|   *?          0 or more, lazy
 | |
|   +           1 or more, greedy
 | |
|   ++          1 or more, possessive
 | |
|   +?          1 or more, lazy
 | |
|   {n}         exactly n
 | |
|   {n,m}       at least n, no more than m, greedy
 | |
|   {n,m}+      at least n, no more than m, possessive
 | |
|   {n,m}?      at least n, no more than m, lazy
 | |
|   {n,}        n or more, greedy
 | |
|   {n,}+       n or more, possessive
 | |
|   {n,}?       n or more, lazy
 | |
| .
 | |
| .
 | |
| .SH "ANCHORS AND SIMPLE ASSERTIONS"
 | |
| .rs
 | |
| .sp
 | |
|   \eb          word boundary
 | |
|   \eB          not a word boundary
 | |
|   ^           start of subject
 | |
|                also after internal newline in multiline mode
 | |
|   \eA          start of subject
 | |
|   $           end of subject
 | |
|                also before newline at end of subject
 | |
|                also before internal newline in multiline mode
 | |
|   \eZ          end of subject
 | |
|                also before newline at end of subject
 | |
|   \ez          end of subject
 | |
|   \eG          first matching position in subject
 | |
| .
 | |
| .
 | |
| .SH "MATCH POINT RESET"
 | |
| .rs
 | |
| .sp
 | |
|   \eK          reset start of match
 | |
| .
 | |
| .
 | |
| .SH "ALTERNATION"
 | |
| .rs
 | |
| .sp
 | |
|   expr|expr|expr...
 | |
| .
 | |
| .
 | |
| .SH "CAPTURING"
 | |
| .rs
 | |
| .sp
 | |
|   (...)           capturing group
 | |
|   (?<name>...)    named capturing group (Perl)
 | |
|   (?'name'...)    named capturing group (Perl)
 | |
|   (?P<name>...)   named capturing group (Python)
 | |
|   (?:...)         non-capturing group
 | |
|   (?|...)         non-capturing group; reset group numbers for
 | |
|                    capturing groups in each alternative
 | |
| .
 | |
| .
 | |
| .SH "ATOMIC GROUPS"
 | |
| .rs
 | |
| .sp
 | |
|   (?>...)         atomic, non-capturing group
 | |
| .
 | |
| .
 | |
| .
 | |
| .
 | |
| .SH "COMMENT"
 | |
| .rs
 | |
| .sp
 | |
|   (?#....)        comment (not nestable)
 | |
| .
 | |
| .
 | |
| .SH "OPTION SETTING"
 | |
| .rs
 | |
| .sp
 | |
|   (?i)            caseless
 | |
|   (?J)            allow duplicate names
 | |
|   (?m)            multiline
 | |
|   (?s)            single line (dotall)
 | |
|   (?U)            default ungreedy (lazy)
 | |
|   (?x)            extended (ignore white space)
 | |
|   (?-...)         unset option(s)
 | |
| .sp
 | |
| The following are recognized only at the start of a pattern or after one of the
 | |
| newline-setting options with similar syntax:
 | |
| .sp
 | |
|   (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
 | |
|   (*UTF8)         set UTF-8 mode: 8-bit library (PCRE_UTF8)
 | |
|   (*UTF16)        set UTF-16 mode: 16-bit library (PCRE_UTF16)
 | |
|   (*UTF32)        set UTF-32 mode: 32-bit library (PCRE_UTF32)
 | |
|   (*UTF)          set appropriate UTF mode for the library in use
 | |
|   (*UCP)          set PCRE_UCP (use Unicode properties for \ed etc.)
 | |
| .
 | |
| .
 | |
| .SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
 | |
| .rs
 | |
| .sp
 | |
|   (?=...)         positive look ahead
 | |
|   (?!...)         negative look ahead
 | |
|   (?<=...)        positive look behind
 | |
|   (?<!...)        negative look behind
 | |
| .sp
 | |
| Each top-level branch of a look behind must be of a fixed length.
 | |
| .
 | |
| .
 | |
| .SH "BACKREFERENCES"
 | |
| .rs
 | |
| .sp
 | |
|   \en              reference by number (can be ambiguous)
 | |
|   \egn             reference by number
 | |
|   \eg{n}           reference by number
 | |
|   \eg{-n}          relative reference by number
 | |
|   \ek<name>        reference by name (Perl)
 | |
|   \ek'name'        reference by name (Perl)
 | |
|   \eg{name}        reference by name (Perl)
 | |
|   \ek{name}        reference by name (.NET)
 | |
|   (?P=name)       reference by name (Python)
 | |
| .
 | |
| .
 | |
| .SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
 | |
| .rs
 | |
| .sp
 | |
|   (?R)            recurse whole pattern
 | |
|   (?n)            call subpattern by absolute number
 | |
|   (?+n)           call subpattern by relative number
 | |
|   (?-n)           call subpattern by relative number
 | |
|   (?&name)        call subpattern by name (Perl)
 | |
|   (?P>name)       call subpattern by name (Python)
 | |
|   \eg<name>        call subpattern by name (Oniguruma)
 | |
|   \eg'name'        call subpattern by name (Oniguruma)
 | |
|   \eg<n>           call subpattern by absolute number (Oniguruma)
 | |
|   \eg'n'           call subpattern by absolute number (Oniguruma)
 | |
|   \eg<+n>          call subpattern by relative number (PCRE extension)
 | |
|   \eg'+n'          call subpattern by relative number (PCRE extension)
 | |
|   \eg<-n>          call subpattern by relative number (PCRE extension)
 | |
|   \eg'-n'          call subpattern by relative number (PCRE extension)
 | |
| .
 | |
| .
 | |
| .SH "CONDITIONAL PATTERNS"
 | |
| .rs
 | |
| .sp
 | |
|   (?(condition)yes-pattern)
 | |
|   (?(condition)yes-pattern|no-pattern)
 | |
| .sp
 | |
|   (?(n)...        absolute reference condition
 | |
|   (?(+n)...       relative reference condition
 | |
|   (?(-n)...       relative reference condition
 | |
|   (?(<name>)...   named reference condition (Perl)
 | |
|   (?('name')...   named reference condition (Perl)
 | |
|   (?(name)...     named reference condition (PCRE)
 | |
|   (?(R)...        overall recursion condition
 | |
|   (?(Rn)...       specific group recursion condition
 | |
|   (?(R&name)...   specific recursion condition
 | |
|   (?(DEFINE)...   define subpattern for reference
 | |
|   (?(assert)...   assertion condition
 | |
| .
 | |
| .
 | |
| .SH "BACKTRACKING CONTROL"
 | |
| .rs
 | |
| .sp
 | |
| The following act immediately when they are reached:
 | |
| .sp
 | |
|   (*ACCEPT)       force successful match
 | |
|   (*FAIL)         force backtrack; synonym (*F)
 | |
|   (*MARK:NAME)    set name to be passed back; synonym (*:NAME)
 | |
| .sp
 | |
| The following act only when a subsequent match failure causes a backtrack to
 | |
| reach them. They all force a match failure, but they differ in what happens
 | |
| afterwards. Those that advance the start-of-match point do so only if the
 | |
| pattern is not anchored.
 | |
| .sp
 | |
|   (*COMMIT)       overall failure, no advance of starting point
 | |
|   (*PRUNE)        advance to next starting character
 | |
|   (*PRUNE:NAME)   equivalent to (*MARK:NAME)(*PRUNE)
 | |
|   (*SKIP)         advance to current matching position
 | |
|   (*SKIP:NAME)    advance to position corresponding to an earlier
 | |
|                   (*MARK:NAME); if not found, the (*SKIP) is ignored
 | |
|   (*THEN)         local failure, backtrack to next alternation
 | |
|   (*THEN:NAME)    equivalent to (*MARK:NAME)(*THEN)
 | |
| .
 | |
| .
 | |
| .SH "NEWLINE CONVENTIONS"
 | |
| .rs
 | |
| .sp
 | |
| These are recognized only at the very start of the pattern or after a
 | |
| (*BSR_...), (*UTF8), (*UTF16), (*UTF32), (*UTF) or (*UCP) option.
 | |
| .sp
 | |
|   (*CR)           carriage return only
 | |
|   (*LF)           linefeed only
 | |
|   (*CRLF)         carriage return followed by linefeed
 | |
|   (*ANYCRLF)      all three of the above
 | |
|   (*ANY)          any Unicode newline sequence
 | |
| .
 | |
| .
 | |
| .SH "WHAT \eR MATCHES"
 | |
| .rs
 | |
| .sp
 | |
| These are recognized only at the very start of the pattern or after a
 | |
| (*...) option that sets the newline convention or a UTF or UCP mode.
 | |
| .sp
 | |
|   (*BSR_ANYCRLF)  CR, LF, or CRLF
 | |
|   (*BSR_UNICODE)  any Unicode newline sequence
 | |
| .
 | |
| .
 | |
| .SH "CALLOUTS"
 | |
| .rs
 | |
| .sp
 | |
|   (?C)      callout
 | |
|   (?Cn)     callout with data n
 | |
| .
 | |
| .
 | |
| .SH "SEE ALSO"
 | |
| .rs
 | |
| .sp
 | |
| \fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
 | |
| \fBpcrematching\fP(3), \fBpcre\fP(3).
 | |
| .
 | |
| .
 | |
| .SH AUTHOR
 | |
| .rs
 | |
| .sp
 | |
| .nf
 | |
| Philip Hazel
 | |
| University Computing Service
 | |
| Cambridge CB2 3QH, England.
 | |
| .fi
 | |
| .
 | |
| .
 | |
| .SH REVISION
 | |
| .rs
 | |
| .sp
 | |
| .nf
 | |
| Last updated: 11 November 2012
 | |
| Copyright (c) 1997-2012 University of Cambridge.
 | |
| .fi
 |