450 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Groff
		
	
	
	
	
	
			
		
		
	
	
			450 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Groff
		
	
	
	
	
	
.TH PCRESYNTAX 3
 | 
						|
.SH NAME
 | 
						|
PCRE - Perl-compatible regular expressions
 | 
						|
.SH "PCRE REGULAR EXPRESSION SYNTAX SUMMARY"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
The full syntax and semantics of the regular expressions that are supported by
 | 
						|
PCRE are described in the
 | 
						|
.\" HREF
 | 
						|
\fBpcrepattern\fP
 | 
						|
.\"
 | 
						|
documentation. This document contains just a quick-reference summary of the
 | 
						|
syntax.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "QUOTING"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  \ex         where x is non-alphanumeric is a literal x
 | 
						|
  \eQ...\eE    treat enclosed characters as literal
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CHARACTERS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  \ea         alarm, that is, the BEL character (hex 07)
 | 
						|
  \ecx        "control-x", where x is any character
 | 
						|
  \ee         escape (hex 1B)
 | 
						|
  \ef         formfeed (hex 0C)
 | 
						|
  \en         newline (hex 0A)
 | 
						|
  \er         carriage return (hex 0D)
 | 
						|
  \et         tab (hex 09)
 | 
						|
  \eddd       character with octal code ddd, or backreference
 | 
						|
  \exhh       character with hex code hh
 | 
						|
  \ex{hhh..}  character with hex code hhh..
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CHARACTER TYPES"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  .          any character except newline;
 | 
						|
               in dotall mode, any character whatsoever
 | 
						|
  \eC         one byte, even in UTF-8 mode (best avoided)
 | 
						|
  \ed         a decimal digit
 | 
						|
  \eD         a character that is not a decimal digit
 | 
						|
  \eh         a horizontal whitespace character
 | 
						|
  \eH         a character that is not a horizontal whitespace character
 | 
						|
  \ep{\fIxx\fP}     a character with the \fIxx\fP property
 | 
						|
  \eP{\fIxx\fP}     a character without the \fIxx\fP property
 | 
						|
  \eR         a newline sequence
 | 
						|
  \es         a whitespace character
 | 
						|
  \eS         a character that is not a whitespace character
 | 
						|
  \ev         a vertical whitespace character
 | 
						|
  \eV         a character that is not a vertical whitespace character
 | 
						|
  \ew         a "word" character
 | 
						|
  \eW         a "non-word" character
 | 
						|
  \eX         an extended Unicode sequence
 | 
						|
.sp
 | 
						|
In PCRE, \ed, \eD, \es, \eS, \ew, and \eW recognize only ASCII characters.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "GENERAL CATEGORY PROPERTY CODES FOR \ep AND \eP"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  C          Other
 | 
						|
  Cc         Control
 | 
						|
  Cf         Format
 | 
						|
  Cn         Unassigned
 | 
						|
  Co         Private use
 | 
						|
  Cs         Surrogate
 | 
						|
.sp
 | 
						|
  L          Letter
 | 
						|
  Ll         Lower case letter
 | 
						|
  Lm         Modifier letter
 | 
						|
  Lo         Other letter
 | 
						|
  Lt         Title case letter
 | 
						|
  Lu         Upper case letter
 | 
						|
  L&         Ll, Lu, or Lt
 | 
						|
.sp
 | 
						|
  M          Mark
 | 
						|
  Mc         Spacing mark
 | 
						|
  Me         Enclosing mark
 | 
						|
  Mn         Non-spacing mark
 | 
						|
.sp
 | 
						|
  N          Number
 | 
						|
  Nd         Decimal number
 | 
						|
  Nl         Letter number
 | 
						|
  No         Other number
 | 
						|
.sp
 | 
						|
  P          Punctuation
 | 
						|
  Pc         Connector punctuation
 | 
						|
  Pd         Dash punctuation
 | 
						|
  Pe         Close punctuation
 | 
						|
  Pf         Final punctuation
 | 
						|
  Pi         Initial punctuation
 | 
						|
  Po         Other punctuation
 | 
						|
  Ps         Open punctuation
 | 
						|
.sp
 | 
						|
  S          Symbol
 | 
						|
  Sc         Currency symbol
 | 
						|
  Sk         Modifier symbol
 | 
						|
  Sm         Mathematical symbol
 | 
						|
  So         Other symbol
 | 
						|
.sp
 | 
						|
  Z          Separator
 | 
						|
  Zl         Line separator
 | 
						|
  Zp         Paragraph separator
 | 
						|
  Zs         Space separator
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "SCRIPT NAMES FOR \ep AND \eP"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
Arabic,
 | 
						|
Armenian,
 | 
						|
Balinese,
 | 
						|
Bengali,
 | 
						|
Bopomofo,
 | 
						|
Braille,
 | 
						|
Buginese,
 | 
						|
Buhid,
 | 
						|
Canadian_Aboriginal,
 | 
						|
Carian,
 | 
						|
Cham,
 | 
						|
Cherokee,
 | 
						|
Common,
 | 
						|
Coptic,
 | 
						|
Cuneiform,
 | 
						|
Cypriot,
 | 
						|
Cyrillic,
 | 
						|
Deseret,
 | 
						|
Devanagari,
 | 
						|
Ethiopic,
 | 
						|
Georgian,
 | 
						|
Glagolitic,
 | 
						|
Gothic,
 | 
						|
Greek,
 | 
						|
Gujarati,
 | 
						|
Gurmukhi,
 | 
						|
Han,
 | 
						|
Hangul,
 | 
						|
Hanunoo,
 | 
						|
Hebrew,
 | 
						|
Hiragana,
 | 
						|
Inherited,
 | 
						|
Kannada,
 | 
						|
Katakana,
 | 
						|
Kayah_Li,
 | 
						|
Kharoshthi,
 | 
						|
Khmer,
 | 
						|
Lao,
 | 
						|
Latin,
 | 
						|
Lepcha,
 | 
						|
Limbu,
 | 
						|
Linear_B,
 | 
						|
Lycian,
 | 
						|
Lydian,
 | 
						|
Malayalam,
 | 
						|
Mongolian,
 | 
						|
Myanmar,
 | 
						|
New_Tai_Lue,
 | 
						|
Nko,
 | 
						|
Ogham,
 | 
						|
Old_Italic,
 | 
						|
Old_Persian,
 | 
						|
Ol_Chiki,
 | 
						|
Oriya,
 | 
						|
Osmanya,
 | 
						|
Phags_Pa,
 | 
						|
Phoenician,
 | 
						|
Rejang,
 | 
						|
Runic,
 | 
						|
Saurashtra,
 | 
						|
Shavian,
 | 
						|
Sinhala,
 | 
						|
Sundanese,
 | 
						|
Syloti_Nagri,
 | 
						|
Syriac,
 | 
						|
Tagalog,
 | 
						|
Tagbanwa,
 | 
						|
Tai_Le,
 | 
						|
Tamil,
 | 
						|
Telugu,
 | 
						|
Thaana,
 | 
						|
Thai,
 | 
						|
Tibetan,
 | 
						|
Tifinagh,
 | 
						|
Ugaritic,
 | 
						|
Vai,
 | 
						|
Yi.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CHARACTER CLASSES"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  [...]       positive character class
 | 
						|
  [^...]      negative character class
 | 
						|
  [x-y]       range (can be used for hex characters)
 | 
						|
  [[:xxx:]]   positive POSIX named set
 | 
						|
  [[:^xxx:]]  negative POSIX named set
 | 
						|
.sp
 | 
						|
  alnum       alphanumeric
 | 
						|
  alpha       alphabetic
 | 
						|
  ascii       0-127
 | 
						|
  blank       space or tab
 | 
						|
  cntrl       control character
 | 
						|
  digit       decimal digit
 | 
						|
  graph       printing, excluding space
 | 
						|
  lower       lower case letter
 | 
						|
  print       printing, including space
 | 
						|
  punct       printing, excluding alphanumeric
 | 
						|
  space       whitespace
 | 
						|
  upper       upper case letter
 | 
						|
  word        same as \ew
 | 
						|
  xdigit      hexadecimal digit
 | 
						|
.sp
 | 
						|
In PCRE, POSIX character set names recognize only ASCII characters. You can use
 | 
						|
\eQ...\eE inside a character class.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "QUANTIFIERS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  ?           0 or 1, greedy
 | 
						|
  ?+          0 or 1, possessive
 | 
						|
  ??          0 or 1, lazy
 | 
						|
  *           0 or more, greedy
 | 
						|
  *+          0 or more, possessive
 | 
						|
  *?          0 or more, lazy
 | 
						|
  +           1 or more, greedy
 | 
						|
  ++          1 or more, possessive
 | 
						|
  +?          1 or more, lazy
 | 
						|
  {n}         exactly n
 | 
						|
  {n,m}       at least n, no more than m, greedy
 | 
						|
  {n,m}+      at least n, no more than m, possessive
 | 
						|
  {n,m}?      at least n, no more than m, lazy
 | 
						|
  {n,}        n or more, greedy
 | 
						|
  {n,}+       n or more, possessive
 | 
						|
  {n,}?       n or more, lazy
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "ANCHORS AND SIMPLE ASSERTIONS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  \eb          word boundary (only ASCII letters recognized)
 | 
						|
  \eB          not a word boundary
 | 
						|
  ^           start of subject
 | 
						|
               also after internal newline in multiline mode
 | 
						|
  \eA          start of subject
 | 
						|
  $           end of subject
 | 
						|
               also before newline at end of subject
 | 
						|
               also before internal newline in multiline mode
 | 
						|
  \eZ          end of subject
 | 
						|
               also before newline at end of subject
 | 
						|
  \ez          end of subject
 | 
						|
  \eG          first matching position in subject
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "MATCH POINT RESET"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  \eK          reset start of match
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "ALTERNATION"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  expr|expr|expr...
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CAPTURING"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (...)           capturing group
 | 
						|
  (?<name>...)    named capturing group (Perl)
 | 
						|
  (?'name'...)    named capturing group (Perl)
 | 
						|
  (?P<name>...)   named capturing group (Python)
 | 
						|
  (?:...)         non-capturing group
 | 
						|
  (?|...)         non-capturing group; reset group numbers for
 | 
						|
                   capturing groups in each alternative
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "ATOMIC GROUPS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?>...)         atomic, non-capturing group
 | 
						|
.
 | 
						|
.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "COMMENT"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?#....)        comment (not nestable)
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "OPTION SETTING"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?i)            caseless
 | 
						|
  (?J)            allow duplicate names
 | 
						|
  (?m)            multiline
 | 
						|
  (?s)            single line (dotall)
 | 
						|
  (?U)            default ungreedy (lazy)
 | 
						|
  (?x)            extended (ignore white space)
 | 
						|
  (?-...)         unset option(s)
 | 
						|
.sp
 | 
						|
The following is recognized only at the start of a pattern or after one of the
 | 
						|
newline-setting options with similar syntax:
 | 
						|
.sp
 | 
						|
  (*UTF8)         set UTF-8 mode
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?=...)         positive look ahead
 | 
						|
  (?!...)         negative look ahead
 | 
						|
  (?<=...)        positive look behind
 | 
						|
  (?<!...)        negative look behind
 | 
						|
.sp
 | 
						|
Each top-level branch of a look behind must be of a fixed length.
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "BACKREFERENCES"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  \en              reference by number (can be ambiguous)
 | 
						|
  \egn             reference by number
 | 
						|
  \eg{n}           reference by number
 | 
						|
  \eg{-n}          relative reference by number
 | 
						|
  \ek<name>        reference by name (Perl)
 | 
						|
  \ek'name'        reference by name (Perl)
 | 
						|
  \eg{name}        reference by name (Perl)
 | 
						|
  \ek{name}        reference by name (.NET)
 | 
						|
  (?P=name)       reference by name (Python)
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?R)            recurse whole pattern
 | 
						|
  (?n)            call subpattern by absolute number
 | 
						|
  (?+n)           call subpattern by relative number
 | 
						|
  (?-n)           call subpattern by relative number
 | 
						|
  (?&name)        call subpattern by name (Perl)
 | 
						|
  (?P>name)       call subpattern by name (Python)
 | 
						|
  \eg<name>        call subpattern by name (Oniguruma)
 | 
						|
  \eg'name'        call subpattern by name (Oniguruma)
 | 
						|
  \eg<n>           call subpattern by absolute number (Oniguruma)
 | 
						|
  \eg'n'           call subpattern by absolute number (Oniguruma)
 | 
						|
  \eg<+n>          call subpattern by relative number (PCRE extension)
 | 
						|
  \eg'+n'          call subpattern by relative number (PCRE extension)
 | 
						|
  \eg<-n>          call subpattern by relative number (PCRE extension)
 | 
						|
  \eg'-n'          call subpattern by relative number (PCRE extension)
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CONDITIONAL PATTERNS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?(condition)yes-pattern)
 | 
						|
  (?(condition)yes-pattern|no-pattern)
 | 
						|
.sp
 | 
						|
  (?(n)...        absolute reference condition
 | 
						|
  (?(+n)...       relative reference condition
 | 
						|
  (?(-n)...       relative reference condition
 | 
						|
  (?(<name>)...   named reference condition (Perl)
 | 
						|
  (?('name')...   named reference condition (Perl)
 | 
						|
  (?(name)...     named reference condition (PCRE)
 | 
						|
  (?(R)...        overall recursion condition
 | 
						|
  (?(Rn)...       specific group recursion condition
 | 
						|
  (?(R&name)...   specific recursion condition
 | 
						|
  (?(DEFINE)...   define subpattern for reference
 | 
						|
  (?(assert)...   assertion condition
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "BACKTRACKING CONTROL"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
The following act as soon as they are reached:
 | 
						|
.sp
 | 
						|
  (*ACCEPT)       force successful match
 | 
						|
  (*FAIL)         force backtrack; synonym (*F)
 | 
						|
.sp
 | 
						|
The following act only when a subsequent match failure causes a backtrack to
 | 
						|
reach them. They all force a match failure, but they differ in what happens
 | 
						|
afterwards. Those that advance the start-of-match point do so only if the
 | 
						|
pattern is not anchored.
 | 
						|
.sp
 | 
						|
  (*COMMIT)       overall failure, no advance of starting point
 | 
						|
  (*PRUNE)        advance to next starting character
 | 
						|
  (*SKIP)         advance start to current matching position
 | 
						|
  (*THEN)         local failure, backtrack to next alternation
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "NEWLINE CONVENTIONS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
These are recognized only at the very start of the pattern or after a
 | 
						|
(*BSR_...) or (*UTF8) option.
 | 
						|
.sp
 | 
						|
  (*CR)           carriage return only
 | 
						|
  (*LF)           linefeed only
 | 
						|
  (*CRLF)         carriage return followed by linefeed
 | 
						|
  (*ANYCRLF)      all three of the above
 | 
						|
  (*ANY)          any Unicode newline sequence
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "WHAT \eR MATCHES"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
These are recognized only at the very start of the pattern or after a
 | 
						|
(*...) option that sets the newline convention or UTF-8 mode.
 | 
						|
.sp
 | 
						|
  (*BSR_ANYCRLF)  CR, LF, or CRLF
 | 
						|
  (*BSR_UNICODE)  any Unicode newline sequence
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "CALLOUTS"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
  (?C)      callout
 | 
						|
  (?Cn)     callout with data n
 | 
						|
.
 | 
						|
.
 | 
						|
.SH "SEE ALSO"
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
\fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
 | 
						|
\fBpcrematching\fP(3), \fBpcre\fP(3).
 | 
						|
.
 | 
						|
.
 | 
						|
.SH AUTHOR
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
.nf
 | 
						|
Philip Hazel
 | 
						|
University Computing Service
 | 
						|
Cambridge CB2 3QH, England.
 | 
						|
.fi
 | 
						|
.
 | 
						|
.
 | 
						|
.SH REVISION
 | 
						|
.rs
 | 
						|
.sp
 | 
						|
.nf
 | 
						|
Last updated: 11 April 2009
 | 
						|
Copyright (c) 1997-2009 University of Cambridge.
 | 
						|
.fi
 |