10. Regular Expression Matching，简易正则表达式匹配

Example 1:
Input: s = "aa", p = "a"
Output: false
Explanation: "a" does not match the entire string "aa".

Example 2:
Input: s = "aa", p = "a*"
Output: true
Explanation: '*' means zero or more of the preceding element, 'a'. Therefore, by repeating 'a' once, it becomes "aa".

Example 3:
Input: s = "ab", p = ".*"
Output: true
Explanation: ".*" means "zero or more (*) of any character (.)".

递归

平常匹配一串字符，都是走循环，但走循环难以处理*带来的复杂度。递归的逻辑是，每次只匹配最开始的那个字符，然后通过走递归的方式匹配后面的字符。

复杂度分析：设字符串长度为n，pattern的长度为m，参考下面代码，最坏的情况是,

前一项表示zero match，后一项表示one match。这是Crazy-bad的指数级增长！隐藏着重复计算相同内容的魔鬼...

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        # recursive base
        # if run out of p, return not s
        if not p:  # if p == ''
            return not s  # return s == '' 
        # check if the first char matched,
        # if run out of s, False
        match = s and p[0] in (s[0], '.')
        # two possibilities for a*
        # 1. ignore them (match zero)
        # 2. match one or more a
        if len(p)>=2 and p[1]=='*':
            return (self.isMatch(s,p[2:]) or
                    match and self.isMatch(s[1:],p))
        # no a* and recursive
        return match and self.isMatch(s[1:], p[1:])

C++

class Solution {
public:
    bool isMatch(string s, string p) noexcept{
        if(p.empty())
            return s.empty() ? 1 : 0;
        bool match {!s.empty() && (p[0]==s[0]||p[0]=='.')};
        if((p.size()>=2) && (p[1]=='*'))
            return (isMatch(s,p.substr(2)) ||
                    match && isMatch(s.substr(1),p));
        return match && isMatch(s.substr(1),p.substr(1));
    }
};

bool isMatch(char *s, char *p){
    if(*p == 0)
        return *s==0 ? 1 : 0;
    bool match = (*s!=0 && (p[0]==s[0]||p[0]=='.'));
    if(strlen(p)>=2 && p[1]=='*')
        return (isMatch(s,p+2) ||
                match && isMatch(s+1,p));
    return match && isMatch(s+1,p+1);
}

Dynamic Programming （Top-Down）

假设带分析字符串的长度为a，pattern的长度为b，二者每一个长度组合都会有一次\(O(1)\)的计算，并将结果保存起来（后面的就是查询）。因此时间和空间复杂度，都是\(O(ab)\)。

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        def _is_match(i,j):
            if (i,j) in match_data:
                return match_data[i,j]
            if j == plen:
                return i == slen
            match = s[i:] and p[j] in (s[i],'.')
            if plen-j>=2 and p[j+1]=='*':
                match_data[i,j] = (_is_match(i,j+2) or
                                   match and _is_match(i+1,j))
            else:
                match_data[i,j] = match and _is_match(i+1,j+1)
            return match_data[i,j]

        slen = len(s)
        plen = len(p)
        match_data = {}
        return _is_match(0,0)

C++

class Solution {
    string s;
    string p;
    size_t slen;
    size_t plen;
    vector<vector<char>> match_data;

    bool _is_match(size_t i, size_t j) noexcept{
        if(match_data[i][j] != -1)
            return match_data[i][j];
        if(j == plen)
            return i==slen ? true : false;
        bool match {i!=slen && (p[j]==s[i]||p[j]=='.')};
        if((plen-j>=2) && (p[j+1]=='*'))
            match_data[i][j] = (_is_match(i,j+2) ||
                          match && _is_match(i+1,j));
        else
            match_data[i][j] = match && _is_match(i+1,j+1);
        return match_data[i][j];
    }

public:
    bool isMatch(string s, string p) noexcept{
        this->s = s;
        this->p = p;
        slen = s.size();
        plen = p.size();
        for(size_t i{}; i<slen+1; ++i)
            match_data.emplace_back(plen+1, -1);
        return _is_match(0,0);
    }
};

static char *gs;
static char *gp;
static size_t slen;
static size_t plen;
static char *match_data = NULL;


#define IJ  (i*(plen+1)+j)
bool _is_match(int i, int j){
    if(match_data[IJ] != -1)
        return match_data[IJ];
    if(j == plen)
        return i == slen;
    bool match = (gs[i]!=0 && (gp[j]==gs[i]||gp[j]=='.'));
    if((plen-j)>=2 && gp[j+1]=='*')
        match_data[IJ] = (_is_match(i,j+2) ||
                          (match && _is_match(i+1,j)));
    else
        match_data[IJ] = match && _is_match(i+1,j+1);
    return match_data[IJ];
}


bool isMatch(char *s, char *p){
    slen = strlen(gs=s);
    plen = strlen(gp=p);
    match_data = (char*)malloc((slen+1)*(plen+1));
    memset(match_data, -1, (slen+1)*(plen+1));
    bool rtv = _is_match(0,0);
    free(match_data);
    match_data = NULL;
    return rtv;
}

Dynamic Programming （Bottom-Up）

Python

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        slen = len(s)
        plen = len(p)
        match_data = {(i,j):False for i in range(slen+1)
                                  for j in range(plen+1)}
        match_data[slen,plen] = True
        for i in range(slen, -1, -1):
            for j in range(plen-1, -1, -1):
                match = i<slen and p[j] in (s[i],'.')
                if plen-j>=2 and p[j+1]=='*':
                    match_data[i,j] = (match_data[i,j+2] or
                                       match and match_data[i+1,j])
                else:
                    match_data[i,j] = match and match_data[i+1,j+1]
        return match_data[0,0]

C++

class Solution {
public:
    bool isMatch(string s, string p) noexcept{
        int slen {static_cast<int>(s.size())};
        int plen {static_cast<int>(p.size())};
        vector<vector<bool>> match_data(slen+1, vector<bool>(plen+1,false));
        match_data[slen][plen] = true;
        for(int i{slen}; i>=0; --i)
            for(int j{plen-1}; j>=0; --j){
                bool match {i<slen && (p[j]==s[i]||p[j]=='.')};
                if((plen-j)>=2 && (p[j+1]=='*'))
                    match_data[i][j] = (match_data[i][j+2] ||
                                        match && match_data[i+1][j]);
                else
                    match_data[i][j] = match && match_data[i+1][j+1];
            }
        return match_data[0][0];
    }
};

bool isMatch(char *s, char *p){
    int slen = strlen(s);
    int plen = strlen(p);
    char (*match_data)[plen+1] = (char*)calloc(slen+1,plen+1);
    match_data[slen][plen] = 1;
    for(int i=slen; i>=0; --i)
        for(int j=plen-1; j>=0; --j){
            bool match = i<slen && (p[j]==s[i]||p[j]=='.');
            if((plen-j)>=2 && (p[j+1]=='*'))
                match_data[i][j] = (match_data[i][j+2] ||
                                    match && match_data[i+1][j]);
            else
                match_data[i][j] = match && match_data[i+1][j+1];
        }
    bool rtv = match_data[0][0];
    free(match_data);
    return rtv;
}

用stack实现backtracking

当遇到*时，状态转换可能存在多条路径，默认采用greedy匹配，同时将其它可能的路径用stack保存，当发现无法匹配（无法到达成功的状态）时，如果stack不为空，则从stack中取出一条路径，重新开始后面的匹配（状态变换）。

其实，recursive的实现也是backtracking，只是方式不同。backtracking都很慢！用stack只是规避了recursive call的深度问题，执行速度快一些。

比如这个正则表达式：(a?){4}，按照stack+greedy的思路，我们用1表示匹配一个a，用0表示跳过，可能的尝试有：

即最多存在2^4这么多尝试的路径，才能确实最后的结果。这是指数级的增长！

可惜的是，通过学习各种资料发现，Python的re模块（很多其它编程语言自带的正则模块都一样，大部分都采用PCRE），就是这种backtracking的实现，例如下面这个匹配，不知道需要多长时间，反正就是停不下来：

>>> re.match(r'(a?){64}a{64}', 'a'*64)  # God knows when stop

投机取巧

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        return re.match(p+'$',s)

re.error: multiple repeat at position 2
    raise source.error("multiple repeat",
Line 672 in _parse (/usr/lib/python3.10/sre_parse.py)
    itemsappend(_parse(source, state, verbose, nested + 1,
Line 444 in _parse_sub (/usr/lib/python3.10/sre_parse.py)
    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
Line 955 in parse (/usr/lib/python3.10/sre_parse.py)
    p = sre_parse.parse(p, flags)
Line 788 in compile (/usr/lib/python3.10/sre_compile.py)
    p = sre_compile.compile(pattern, flags)
Line 303 in _compile (/usr/lib/python3.10/re.py)
    return _compile(pattern, flags).match(string)
Line 190 in match (/usr/lib/python3.10/re.py)
    return re.match(p+'$',s)
Line 3 in isMatch (Solution.py)
    ret = Solution().isMatch(param_1, param_2)
Line 27 in _driver (Solution.py)
    _driver()
Line 38 in <module> (Solution.py)

Last Executed Input
"abc"
"a***abc"

测试用例有点问题！难道是故意的？避免有人采用这种投机的方法提交代码。

class Solution:
    def isMatch(self, s: str, p: str) -> bool:
        p = re.sub(r'[*]+', '*', p)
        return re.match(p+'$',s)

NFA

最后决定用NFA的思路搞一下，采用Thompson multi-state的思路走NFA，one pass就能得到结果。