10. Regular Expression Matching,简易正则表达式匹配

Last Updated: 2023-09-30 07:42:40 Saturday

-- TOC --

学习Python正则表达式

题目分析

Given an input string s and a pattern p, implement regular expression matching with support for '.' and '*' where:

The matching should cover the entire input string (not partial). 这是一个Full Match专用算法。

Example 1:
Input: s = "aa", p = "a"
Output: false
Explanation: "a" does not match the entire string "aa".

Example 2:
Input: s = "aa", p = "a*"
Output: true
Explanation: '*' means zero or more of the preceding element, 'a'. Therefore, by repeating 'a' once, it becomes "aa".

Example 3:
Input: s = "ab", p = ".*"
Output: true
Explanation: ".*" means "zero or more (*) of any character (.)".

Constraints:

递归

平常匹配一串字符,都是走循环,但走循环难以处理*带来的复杂度。递归的逻辑是,每次只匹配最开始的那个字符,然后通过走递归的方式匹配后面的字符。

复杂度分析:设字符串长度为n,pattern的长度为m,参考下面代码,最坏的情况是,

\(T(n,m) = T(n,m-2) + T(n-1,m)\)

前一项表示zero match,后一项表示one match。这是Crazy-bad的指数级增长!隐藏着重复计算相同内容的魔鬼...

Python和C++都是TLE,只有C可以提交,但超级慢!

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        # recursive base
        # if run out of p, return not s
        if not p:  # if p == ''
            return not s  # return s == '' 
        # check if the first char matched,
        # if run out of s, False
        match = s and p[0] in (s[0], '.')
        # two possibilities for a*
        # 1. ignore them (match zero)
        # 2. match one or more a
        if len(p)>=2 and p[1]=='*':
            return (self.isMatch(s,p[2:]) or
                    match and self.isMatch(s[1:],p))
        # no a* and recursive
        return match and self.isMatch(s[1:], p[1:])

C++

class Solution {
public:
    bool isMatch(string s, string p) noexcept{
        if(p.empty())
            return s.empty() ? 1 : 0;
        bool match {!s.empty() && (p[0]==s[0]||p[0]=='.')};
        if((p.size()>=2) && (p[1]=='*'))
            return (isMatch(s,p.substr(2)) ||
                    match && isMatch(s.substr(1),p));
        return match && isMatch(s.substr(1),p.substr(1));
    }
};

C

bool isMatch(char *s, char *p){
    if(*p == 0)
        return *s==0 ? 1 : 0;
    bool match = (*s!=0 && (p[0]==s[0]||p[0]=='.'));
    if(strlen(p)>=2 && p[1]=='*')
        return (isMatch(s,p+2) ||
                match && isMatch(s+1,p));
    return match && isMatch(s+1,p+1);
}

Dynamic Programming (Top-Down)

在递归的过程中,记录和查询已算出的结果。

假设带分析字符串的长度为a,pattern的长度为b,二者每一个长度组合都会有一次\(O(1)\)的计算,并将结果保存起来(后面的就是查询)。因此时间和空间复杂度,都是\(O(ab)\)。

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        def _is_match(i,j):
            if (i,j) in match_data:
                return match_data[i,j]
            if j == plen:
                return i == slen
            match = s[i:] and p[j] in (s[i],'.')
            if plen-j>=2 and p[j+1]=='*':
                match_data[i,j] = (_is_match(i,j+2) or
                                   match and _is_match(i+1,j))
            else:
                match_data[i,j] = match and _is_match(i+1,j+1)
            return match_data[i,j]

        slen = len(s)
        plen = len(p)
        match_data = {}
        return _is_match(0,0)

C++

class Solution {
    string s;
    string p;
    size_t slen;
    size_t plen;
    vector<vector<char>> match_data;

    bool _is_match(size_t i, size_t j) noexcept{
        if(match_data[i][j] != -1)
            return match_data[i][j];
        if(j == plen)
            return i==slen ? true : false;
        bool match {i!=slen && (p[j]==s[i]||p[j]=='.')};
        if((plen-j>=2) && (p[j+1]=='*'))
            match_data[i][j] = (_is_match(i,j+2) ||
                          match && _is_match(i+1,j));
        else
            match_data[i][j] = match && _is_match(i+1,j+1);
        return match_data[i][j];
    }

public:
    bool isMatch(string s, string p) noexcept{
        this->s = s;
        this->p = p;
        slen = s.size();
        plen = p.size();
        for(size_t i{}; i<slen+1; ++i)
            match_data.emplace_back(plen+1, -1);
        return _is_match(0,0);
    }
};

C

static char *gs;
static char *gp;
static size_t slen;
static size_t plen;
static char *match_data = NULL;


#define IJ  (i*(plen+1)+j)
bool _is_match(int i, int j){
    if(match_data[IJ] != -1)
        return match_data[IJ];
    if(j == plen)
        return i == slen;
    bool match = (gs[i]!=0 && (gp[j]==gs[i]||gp[j]=='.'));
    if((plen-j)>=2 && gp[j+1]=='*')
        match_data[IJ] = (_is_match(i,j+2) ||
                          (match && _is_match(i+1,j)));
    else
        match_data[IJ] = match && _is_match(i+1,j+1);
    return match_data[IJ];
}


bool isMatch(char *s, char *p){
    slen = strlen(gs=s);
    plen = strlen(gp=p);
    match_data = (char*)malloc((slen+1)*(plen+1));
    memset(match_data, -1, (slen+1)*(plen+1));
    bool rtv = _is_match(0,0);
    free(match_data);
    match_data = NULL;
    return rtv;
}

用一个char型二维数组来存放中间结果。

Dynamic Programming (Bottom-Up)

一般情况下,没有递归自底向上的DP解法会更快!

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        slen = len(s)
        plen = len(p)
        match_data = {(i,j):False for i in range(slen+1)
                                  for j in range(plen+1)}
        match_data[slen,plen] = True
        for i in range(slen, -1, -1):
            for j in range(plen-1, -1, -1):
                match = i<slen and p[j] in (s[i],'.')
                if plen-j>=2 and p[j+1]=='*':
                    match_data[i,j] = (match_data[i,j+2] or
                                       match and match_data[i+1,j])
                else:
                    match_data[i,j] = match and match_data[i+1,j+1]
        return match_data[0,0]

C++

class Solution {
public:
    bool isMatch(string s, string p) noexcept{
        int slen {static_cast<int>(s.size())};
        int plen {static_cast<int>(p.size())};
        vector<vector<bool>> match_data(slen+1, vector<bool>(plen+1,false));
        match_data[slen][plen] = true;
        for(int i{slen}; i>=0; --i)
            for(int j{plen-1}; j>=0; --j){
                bool match {i<slen && (p[j]==s[i]||p[j]=='.')};
                if((plen-j)>=2 && (p[j+1]=='*'))
                    match_data[i][j] = (match_data[i][j+2] ||
                                        match && match_data[i+1][j]);
                else
                    match_data[i][j] = match && match_data[i+1][j+1];
            }
        return match_data[0][0];
    }
};

C

这是最舒服的C版本!

bool isMatch(char *s, char *p){
    int slen = strlen(s);
    int plen = strlen(p);
    char (*match_data)[plen+1] = (char*)calloc(slen+1,plen+1);
    match_data[slen][plen] = 1;
    for(int i=slen; i>=0; --i)
        for(int j=plen-1; j>=0; --j){
            bool match = i<slen && (p[j]==s[i]||p[j]=='.');
            if((plen-j)>=2 && (p[j+1]=='*'))
                match_data[i][j] = (match_data[i][j+2] ||
                                    match && match_data[i+1][j]);
            else
                match_data[i][j] = match && match_data[i+1][j+1];
        }
    bool rtv = match_data[0][0];
    free(match_data);
    return rtv;
}

用stack实现backtracking

当遇到*时,状态转换可能存在多条路径,默认采用greedy匹配,同时将其它可能的路径用stack保存,当发现无法匹配(无法到达成功的状态)时,如果stack不为空,则从stack中取出一条路径,重新开始后面的匹配(状态变换)。

其实,recursive的实现也是backtracking,只是方式不同。backtracking都很慢!用stack只是规避了recursive call的深度问题,执行速度快一些。

比如这个正则表达式:(a?){4},按照stack+greedy的思路,我们用1表示匹配一个a,用0表示跳过,可能的尝试有:

1111
1110
1101
1100
1011
1010
1001
1000
0111
0110
0101
0100
0011
0010
0001
0000

即最多存在2^4这么多尝试的路径,才能确实最后的结果。这是指数级的增长!

可惜的是,通过学习各种资料发现,Python的re模块(很多其它编程语言自带的正则模块都一样,大部分都采用PCRE),就是这种backtracking的实现,例如下面这个匹配,不知道需要多长时间,反正就是停不下来:

>>> re.match(r'(a?){64}a{64}', 'a'*64)  # God knows when stop

参考:https://swtch.com/~rsc/regexp/regexp1.html。这就是下面投机取巧的方法,也不是特别快的原因吧。

投机取巧

代码:

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        return re.match(p+'$',s)
re.error: multiple repeat at position 2
    raise source.error("multiple repeat",
Line 672 in _parse (/usr/lib/python3.10/sre_parse.py)
    itemsappend(_parse(source, state, verbose, nested + 1,
Line 444 in _parse_sub (/usr/lib/python3.10/sre_parse.py)
    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
Line 955 in parse (/usr/lib/python3.10/sre_parse.py)
    p = sre_parse.parse(p, flags)
Line 788 in compile (/usr/lib/python3.10/sre_compile.py)
    p = sre_compile.compile(pattern, flags)
Line 303 in _compile (/usr/lib/python3.10/re.py)
    return _compile(pattern, flags).match(string)
Line 190 in match (/usr/lib/python3.10/re.py)
    return re.match(p+'$',s)
Line 3 in isMatch (Solution.py)
    ret = Solution().isMatch(param_1, param_2)
Line 27 in _driver (Solution.py)
    _driver()
Line 38 in <module> (Solution.py)

Last Executed Input
"abc"
"a***abc"

测试用例有点问题!难道是故意的?避免有人采用这种投机的方法提交代码。

规避的方法:

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        p = re.sub(r'[*]+', '*', p)
        return re.match(p+'$',s)

但执行效率很低!这可能就是因为通用的正则都比较慢吧。

NFA

最后决定用NFA的思路搞一下,采用Thompson multi-state的思路走NFA,one pass就能得到结果。

本文链接:https://cs.pynote.net/ag/leetcode/202212181/

-- EOF --

-- MORE --