Last Updated: 2023-09-30 07:42:40 Saturday
-- TOC --
Given an input string s and a pattern p, implement regular expression matching with support for '.' and '*' where:
.
Matches any single character. (see constraints)*
Matches zero or more of the preceding element.The matching should cover the entire input string (not partial). 这是一个
Full Match
专用算法。
Example 1:
Input: s = "aa", p = "a"
Output: false
Explanation: "a" does not match the entire string "aa".
Example 2:
Input: s = "aa", p = "a*"
Output: true
Explanation: '*' means zero or more of the preceding element, 'a'. Therefore, by repeating 'a' once, it becomes "aa".
Example 3:
Input: s = "ab", p = ".*"
Output: true
Explanation: ".*" means "zero or more (*) of any character (.)".
Constraints:
- 1 <= s.length <= 20
- 1 <= p.length <= 30
- s contains only lowercase English letters.(这里面只有小写字母,没有'\n')
- p contains only lowercase English letters, '.', and '*'.
- It is guaranteed for each appearance of the character '*', there will be a previous valid character to match.
平常匹配一串字符,都是走循环,但走循环难以处理*
带来的复杂度。递归的逻辑是,每次只匹配最开始的那个字符,然后通过走递归的方式匹配后面的字符。
复杂度分析:设字符串长度为n,pattern的长度为m,参考下面代码,最坏的情况是,
\(T(n,m) = T(n,m-2) + T(n-1,m)\)
前一项表示zero match,后一项表示one match。这是Crazy-bad的指数级增长!隐藏着重复计算相同内容的魔鬼...
Python和C++都是TLE,只有C可以提交,但超级慢!
class Solution:
def isMatch(self, s: str, p: str) -> bool:
# recursive base
# if run out of p, return not s
if not p: # if p == ''
return not s # return s == ''
# check if the first char matched,
# if run out of s, False
match = s and p[0] in (s[0], '.')
# two possibilities for a*
# 1. ignore them (match zero)
# 2. match one or more a
if len(p)>=2 and p[1]=='*':
return (self.isMatch(s,p[2:]) or
match and self.isMatch(s[1:],p))
# no a* and recursive
return match and self.isMatch(s[1:], p[1:])
class Solution {
public:
bool isMatch(string s, string p) noexcept{
if(p.empty())
return s.empty() ? 1 : 0;
bool match {!s.empty() && (p[0]==s[0]||p[0]=='.')};
if((p.size()>=2) && (p[1]=='*'))
return (isMatch(s,p.substr(2)) ||
match && isMatch(s.substr(1),p));
return match && isMatch(s.substr(1),p.substr(1));
}
};
bool isMatch(char *s, char *p){
if(*p == 0)
return *s==0 ? 1 : 0;
bool match = (*s!=0 && (p[0]==s[0]||p[0]=='.'));
if(strlen(p)>=2 && p[1]=='*')
return (isMatch(s,p+2) ||
match && isMatch(s+1,p));
return match && isMatch(s+1,p+1);
}
在递归的过程中,记录和查询已算出的结果。
假设带分析字符串的长度为a,pattern的长度为b,二者每一个长度组合都会有一次\(O(1)\)的计算,并将结果保存起来(后面的就是查询)。因此时间和空间复杂度,都是\(O(ab)\)。
class Solution:
def isMatch(self, s: str, p: str) -> bool:
def _is_match(i,j):
if (i,j) in match_data:
return match_data[i,j]
if j == plen:
return i == slen
match = s[i:] and p[j] in (s[i],'.')
if plen-j>=2 and p[j+1]=='*':
match_data[i,j] = (_is_match(i,j+2) or
match and _is_match(i+1,j))
else:
match_data[i,j] = match and _is_match(i+1,j+1)
return match_data[i,j]
slen = len(s)
plen = len(p)
match_data = {}
return _is_match(0,0)
[i,j]
,就是[(i,j)]
,里面是个tuple。class Solution {
string s;
string p;
size_t slen;
size_t plen;
vector<vector<char>> match_data;
bool _is_match(size_t i, size_t j) noexcept{
if(match_data[i][j] != -1)
return match_data[i][j];
if(j == plen)
return i==slen ? true : false;
bool match {i!=slen && (p[j]==s[i]||p[j]=='.')};
if((plen-j>=2) && (p[j+1]=='*'))
match_data[i][j] = (_is_match(i,j+2) ||
match && _is_match(i+1,j));
else
match_data[i][j] = match && _is_match(i+1,j+1);
return match_data[i][j];
}
public:
bool isMatch(string s, string p) noexcept{
this->s = s;
this->p = p;
slen = s.size();
plen = p.size();
for(size_t i{}; i<slen+1; ++i)
match_data.emplace_back(plen+1, -1);
return _is_match(0,0);
}
};
static char *gs;
static char *gp;
static size_t slen;
static size_t plen;
static char *match_data = NULL;
#define IJ (i*(plen+1)+j)
bool _is_match(int i, int j){
if(match_data[IJ] != -1)
return match_data[IJ];
if(j == plen)
return i == slen;
bool match = (gs[i]!=0 && (gp[j]==gs[i]||gp[j]=='.'));
if((plen-j)>=2 && gp[j+1]=='*')
match_data[IJ] = (_is_match(i,j+2) ||
(match && _is_match(i+1,j)));
else
match_data[IJ] = match && _is_match(i+1,j+1);
return match_data[IJ];
}
bool isMatch(char *s, char *p){
slen = strlen(gs=s);
plen = strlen(gp=p);
match_data = (char*)malloc((slen+1)*(plen+1));
memset(match_data, -1, (slen+1)*(plen+1));
bool rtv = _is_match(0,0);
free(match_data);
match_data = NULL;
return rtv;
}
用一个char型二维数组来存放中间结果。
一般情况下,没有递归自底向上的DP解法会更快!
class Solution:
def isMatch(self, s: str, p: str) -> bool:
slen = len(s)
plen = len(p)
match_data = {(i,j):False for i in range(slen+1)
for j in range(plen+1)}
match_data[slen,plen] = True
for i in range(slen, -1, -1):
for j in range(plen-1, -1, -1):
match = i<slen and p[j] in (s[i],'.')
if plen-j>=2 and p[j+1]=='*':
match_data[i,j] = (match_data[i,j+2] or
match and match_data[i+1,j])
else:
match_data[i,j] = match and match_data[i+1,j+1]
return match_data[0,0]
class Solution {
public:
bool isMatch(string s, string p) noexcept{
int slen {static_cast<int>(s.size())};
int plen {static_cast<int>(p.size())};
vector<vector<bool>> match_data(slen+1, vector<bool>(plen+1,false));
match_data[slen][plen] = true;
for(int i{slen}; i>=0; --i)
for(int j{plen-1}; j>=0; --j){
bool match {i<slen && (p[j]==s[i]||p[j]=='.')};
if((plen-j)>=2 && (p[j+1]=='*'))
match_data[i][j] = (match_data[i][j+2] ||
match && match_data[i+1][j]);
else
match_data[i][j] = match && match_data[i+1][j+1];
}
return match_data[0][0];
}
};
这是最舒服的C版本!
bool isMatch(char *s, char *p){
int slen = strlen(s);
int plen = strlen(p);
char (*match_data)[plen+1] = (char*)calloc(slen+1,plen+1);
match_data[slen][plen] = 1;
for(int i=slen; i>=0; --i)
for(int j=plen-1; j>=0; --j){
bool match = i<slen && (p[j]==s[i]||p[j]=='.');
if((plen-j)>=2 && (p[j+1]=='*'))
match_data[i][j] = (match_data[i][j+2] ||
match && match_data[i+1][j]);
else
match_data[i][j] = match && match_data[i+1][j+1];
}
bool rtv = match_data[0][0];
free(match_data);
return rtv;
}
size_t
是unsigned类型,不能去判断>=0
。&&
优先级高于||
。当遇到*
时,状态转换可能存在多条路径,默认采用greedy匹配,同时将其它可能的路径用stack保存,当发现无法匹配(无法到达成功的状态)时,如果stack不为空,则从stack中取出一条路径,重新开始后面的匹配(状态变换)。
其实,recursive的实现也是backtracking,只是方式不同。backtracking都很慢!用stack只是规避了recursive call的深度问题,执行速度快一些。
比如这个正则表达式:(a?){4}
,按照stack+greedy的思路,我们用1表示匹配一个a,用0表示跳过,可能的尝试有:
1111
1110
1101
1100
1011
1010
1001
1000
0111
0110
0101
0100
0011
0010
0001
0000
即最多存在2^4这么多尝试的路径,才能确实最后的结果。这是指数级的增长!
可惜的是,通过学习各种资料发现,Python的re模块(很多其它编程语言自带的正则模块都一样,大部分都采用PCRE),就是这种backtracking的实现,例如下面这个匹配,不知道需要多长时间,反正就是停不下来:
>>> re.match(r'(a?){64}a{64}', 'a'*64) # God knows when stop
参考:https://swtch.com/~rsc/regexp/regexp1.html。这就是下面投机取巧的方法,也不是特别快的原因吧。
代码:
class Solution:
def isMatch(self, s: str, p: str) -> bool:
return re.match(p+'$',s)
re.error: multiple repeat at position 2
raise source.error("multiple repeat",
Line 672 in _parse (/usr/lib/python3.10/sre_parse.py)
itemsappend(_parse(source, state, verbose, nested + 1,
Line 444 in _parse_sub (/usr/lib/python3.10/sre_parse.py)
p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
Line 955 in parse (/usr/lib/python3.10/sre_parse.py)
p = sre_parse.parse(p, flags)
Line 788 in compile (/usr/lib/python3.10/sre_compile.py)
p = sre_compile.compile(pattern, flags)
Line 303 in _compile (/usr/lib/python3.10/re.py)
return _compile(pattern, flags).match(string)
Line 190 in match (/usr/lib/python3.10/re.py)
return re.match(p+'$',s)
Line 3 in isMatch (Solution.py)
ret = Solution().isMatch(param_1, param_2)
Line 27 in _driver (Solution.py)
_driver()
Line 38 in <module> (Solution.py)
Last Executed Input
"abc"
"a***abc"
测试用例有点问题!难道是故意的?避免有人采用这种投机的方法提交代码。
规避的方法:
class Solution:
def isMatch(self, s: str, p: str) -> bool:
p = re.sub(r'[*]+', '*', p)
return re.match(p+'$',s)
但执行效率很低!这可能就是因为通用的正则都比较慢吧。
最后决定用NFA的思路搞一下,采用Thompson multi-state的思路走NFA,one pass就能得到结果。
本文链接:https://cs.pynote.net/ag/leetcode/202212181/
-- EOF --
-- MORE --