在C++中使用正则表达式

在学习C++的过程中，我发现，C++在C的基础上，不仅提供了全套OOP代码组织工具，还自带一套功能丰富的标准库（STL）。在这一点上C++更像Python，即所谓的batteries included，适合各种应用领域的编程。正则表达式（头文件<regex>）就是一例。

regex_match

regex_match这个接口要求整个输入完全匹配（Full Match）才返回true，可以匹配C风格的字符串、string对象，或iterator pair。

#include <iostream>
#include <string>
#include <regex>
using namespace std;


// Demonstrates std::regex_match on the three supported kinds of input.
// regex_match succeeds only when the ENTIRE input matches the pattern.
// The bool results are streamed without boolalpha, so they print as 1 or 0.
int main(){
    const std::regex pattern("abc");

    // Input given as a C-style string literal.
    std::cout << std::regex_match("abc123", pattern) << std::endl;  // 0: "123" is left over
    std::cout << std::regex_match("abc", pattern) << std::endl;     // 1: whole input matches

    // Input given as a std::string object.
    const std::string with_suffix("abc123");
    const std::string exact("abc");
    std::cout << std::regex_match(with_suffix, pattern) << std::endl;  // 0
    std::cout << std::regex_match(exact, pattern) << std::endl;        // 1

    // Input given as an iterator pair: only the range [first, last) is examined,
    // so the first three characters of "abc123" match on their own.
    const auto first_three_end = with_suffix.begin() + 3;
    std::cout << std::regex_match(with_suffix.begin(), first_three_end, pattern) << std::endl;  // 1
    std::cout << std::regex_match(exact.begin(), exact.end(), pattern) << std::endl;            // 1
    return 0;
}

regex_search

regex_search只要输入中存在能够匹配的子串，就会返回true；此时作为参数传入、并被函数更新的match对象就开始变得有用了。

#include <iostream>
#include <string>
#include <regex>
using namespace std;


// Demonstrates std::regex_search with a pattern that has no capture groups,
// then prints the details recorded in the match object.
int main(){
    const std::regex pattern("abc");
    std::cmatch found;  // match results for const char* input

    // The searched text is a string literal, so the positions stored in
    // `found` remain valid for the rest of the program.
    const bool hit = std::regex_search("123abc456", found, pattern);
    std::cout << hit << std::endl;  // prints 1 when a matching substring exists

    std::cout << "cm.prefix():" << found.prefix() << std::endl;      // text before the match
    std::cout << "cm.suffix():" << found.suffix() << std::endl;      // text after the match
    std::cout << "cm.position():" << found.position() << std::endl;  // offset of the match
    std::cout << "cm.length():" << found.length() << std::endl;      // length of the match
    std::cout << "cm.str():" << found.str() << std::endl;
    std::cout << "cm[0]:" << found[0] << std::endl;
    std::cout << "cm.size():" << found.size() << std::endl;

    // Walk every stored sub-match; with no capture groups there is only the
    // whole match itself.
    for (auto it = found.begin(); it != found.end(); ++it)
        std::cout << *it << " ";
    std::cout << std::endl;
    return 0;
}

1
cm.prefix():123
cm.suffix():456
cm.position():3
cm.length():3
cm.str():abc
cm[0]:abc
cm.size():1
abc

cm.size()并不是匹配的次数，而是match对象中子匹配（sub-match）的个数：第0个是完整匹配，其后每个分组各占一个，所以没有分组时它等于1。当正则表达式中有分组的时候，这个值可以用来遍历（cmatch本身也可以像容器一样被遍历）：

#include <iostream>
#include <string>
#include <regex>
using namespace std;


// Demonstrates std::regex_search with two capture groups and shows two ways
// of walking the resulting sub-matches.
int main(){
    // Raw string literal: the outermost ( ) belong to the R"(...)" syntax and
    // are not part of the regex. The regex itself has two groups.
    const std::regex pattern(R"((\d+)([.]abc))");
    std::cmatch found;

    const bool hit = std::regex_search("123.abc.456", found, pattern);
    std::cout << hit << std::endl;

    // prefix/suffix/position/length without arguments describe the whole match.
    std::cout << "cm.prefix():" << found.prefix() << std::endl;
    std::cout << "cm.suffix():" << found.suffix() << std::endl;
    std::cout << "cm.position():" << found.position() << std::endl;
    std::cout << "cm.length():" << found.length() << std::endl;
    std::cout << "cm.size():" << found.size() << std::endl;

    // Index-based access: entry 0 is the whole match, then one entry per group.
    // str(i) and operator[] print the same text.
    for (std::cmatch::size_type idx = 0; idx < found.size(); ++idx) {
        std::cout << found.str(idx) << "|" << found[idx] << std::endl;
    }

    // Range-based access: each element is a sub-match that knows its own length.
    for (const std::csub_match& group : found) {
        std::cout << group << ";" << group.length() << std::endl;
    }
    return 0;
}

1
cm.prefix():
cm.suffix():.456
cm.position():0
cm.length():7
cm.size():3
123.abc|123.abc
123|123
.abc|.abc
123.abc;7
123;3
.abc;4

两个分组，size=3，首先是一个完整匹配的子串，然后是两个分组的子串。测试代码的最后，是直接遍历match对象，一样的效果。而prefix，suffix，position，length都是按完整匹配计算的结果。length同时也支持sub-match。

// Searching a std::string with a cmatch: cmatch records const char* positions,
// so the string has to be handed over through c_str().
// The string s must stay alive and unmodified for as long as cm is read.
// (std::smatch is the match type that accepts a std::string directly.)
regex re{R"(abc)"};
cmatch cm;
string s{"123123abc"};
regex_search(s.c_str(),cm,re);

regex_replace

#include <iostream>
#include <string>
#include <regex>
using namespace std;


// Demonstrates std::regex_replace: every run of digits in the subject is
// replaced according to a format string.
int main(){
    const std::regex digits(R"(\d+)");
    const char* const subject = "123.abc.456";

    // In a format string, "$&" stands for the text of the whole match:
    //   "x"    -> each number becomes a literal x
    //   "[$&]" -> each number is wrapped in brackets
    //   "$&"   -> each number is replaced by itself (text unchanged)
    for (const char* format : {"x", "[$&]", "$&"}) {
        std::cout << std::regex_replace(subject, digits, format) << std::endl;
    }
    return 0;
}

x.abc.x
[123].abc.[456]
123.abc.456

忽略大小写

// The regex::icase flag makes matching ignore letter case,
// so this pattern also matches e.g. "ABC" or "aBc".
regex reg{"abc", regex::icase};

sregex_token_iterator

#include <iostream>
#include <regex>
#include <string>
using namespace std;


// Demonstrates std::sregex_token_iterator in its two common modes:
// listing the matches themselves (submatch 0) and listing the text between
// the matches (submatch -1), which splits the input on the pattern.
int main(void) {
    const std::regex digits(R"(\d+)");
    const std::string text("!!@123_asdf8&II13423_09y");
    std::cout << text << std::endl;
    std::cout << "----" << std::endl;

    // Prints one token per line for the requested submatch selector.
    // A default-constructed token iterator is the end-of-sequence marker.
    const auto print_tokens = [&](int submatch) {
        const std::sregex_token_iterator finish;
        for (std::sregex_token_iterator token(text.cbegin(), text.cend(), digits, submatch);
             token != finish; ++token) {
            std::cout << token->str() << std::endl;
        }
    };

    print_tokens(0);   // every run of digits

    std::cout << "----" << std::endl;

    print_tokens(-1);  // the pieces of text that the digit runs separate

    return 0;
}

遍历的结束判断条件乍看有点迷惑：默认构造的sregex_token_iterator对象就是end-of-sequence迭代器，所以循环是在与这个“结束”迭代器比较。sregex_token_iterator对象创建的时候，最后的那个参数0表示取每次匹配到的子串，-1表示取没有被匹配到的部分，也就是以匹配为分隔符切分出来的各段（包括最后一次匹配之后剩下的部分）。

!!@123_asdf8&II13423_09y
----
123
8
13423
09
----
!!@
_asdf
&II
_
y