Last Updated: 2023-11-10 15:22:17 Friday
-- TOC --
写一个特别的字符串到32位整型数字的函数接口,此题的说明实在太详细....
经过测试,此题的要求,就是实现一个32位版本的strtoll
函数接口!只是少了两个入参,少了两个功能而已......而C++中也存在一个新的stoi
接口可以直接使用!
Implement the myAtoi(string s) function, which converts a string to a 32-bit signed integer (similar to C/C++'s atoi function). 将字符串转换成32位的整型数。
The algorithm for myAtoi(string s) is as follows:(这道题给出了计算步骤)
- Read in and ignore any leading whitespace.
- Check if the next character (if not already at the end of the string) is '-' or '+'. Read this character in if it is either. This determines if the final result is negative or positive respectively. Assume the result is positive if neither is present.
- Read in next the characters until the next non-digit character or the end of the input is reached. The rest of the string is ignored.
- Convert these digits into an integer (i.e. "123" -> 123, "0032" -> 32). If no digits were read, then the integer is 0. Change the sign as necessary (from step 2). 如果没有数字,返回0。
- If the integer is out of the 32-bit signed integer range [-2^31, 2^31 - 1], then clamp the integer so that it remains in the range. Specifically, integers less than -2^31 should be clamped to -2^31, and integers greater than 2^31 - 1 should be clamped to 2^31 - 1. 这个clamp动作比较特别,饱和计算。
- Return the integer as the final result.
Note:
- Only the space character ' ' is considered a whitespace character.(简化了判断leading space的动作)
- Do not ignore any characters other than the leading whitespace or the rest of the string after the digits.
Example 1:
Input: s = "42"
Output: 42
Example 2:
Input: s = " -42"
Output: -42
Example 3:
Input: s = "4193 with words"
Output: 4193
Since 4193 is in the range [-2^31, 2^31-1], the final result is 4193.
Constraints:
- 0 <= s.length <= 200(32位的int,最多存放10位数字,看来要及时判断overflow的情况,做clamp)
- s consists of English letters (lower-case and upper-case), digits (0-9), ' ', '+', '-', and '.'.
class Solution:
def myAtoi(self, s: str) -> int:
# \s* guarantees the_match will not be None
the_match = re.match(r'\s*([-+]?\d*)', s)
group1 = the_match.group(1)
if group1 in ('-','+',''):
return 0
num = int(group1)
return -0x80000000 if num<-0x80000000 else \
0x7FFFFFFF if num>0x7FFFFFFF else num
小技巧:如果遇到重复使用的正则表达式,先compile,然后再反复使用。
下面这个C++实现,虽然使用了正则表达式,但还是调用了stoi接口。本文后面有一个纯用stoi接口的实现!
class Solution {
public:
int myAtoi(string s) {
regex re {R"(\s*([-+]?\d*))"};
cmatch cm;
regex_search(s.c_str(), cm, re);
if((cm[1].length() == 0) ||
(cm[1] == '-') ||
(cm[1] == '+'))
return 0;
int sign {string(cm[1])[0]=='-'?-1:1};
int rtv{};
try{
rtv = stoi(cm[1]);
}catch(const std::out_of_range&){
return sign==1 ? 0x7FFFFFFF : 0x80000000;
}
return rtv;
}
};
啊。。。为什么这个C++版本超慢。。。?Python用时35ms,C++用时657ms!
按照题目的约束从头开始遍历,并在发现overflow的时候,立即返回。
INT_MIN = -0x80000000
INT_MAX = 0x7FFFFFFF
DIGITS = tuple(str(i) for i in range(10))
class Solution:
def myAtoi(self, s: str) -> int:
sign = 0
i = 0
for i,c in enumerate(s):
if c == ' ':
continue
if c == '-':
if sign != 0:
return 0
sign = -1
i += 1
elif c == '+':
if sign != 0:
return 0
sign = 1
i += 1
break
if sign == 0:
sign = 1
rtv = ''
while i<len(s) and s[i] in DIGITS:
if len(rtv) >= 9:
if int(rtv) > (INT_MAX//10):
return INT_MAX if sign==1 else INT_MIN
elif int(rtv) == (INT_MAX//10):
if sign==1 and s[i]>='7':
return INT_MAX
if sign==-1 and s[i]>='8':
return INT_MIN
rtv += s[i]
i += 1
return int(rtv)*sign if len(rtv) else 0
对于C++代码,可以酌情加个noexcept,取个巧...:)
class Solution {
public:
int myAtoi(string s) noexcept {
int sign {};
int i {};
for(auto c: s){
if(c == ' '){
++i;
continue;
}
if(c == '-'){
if(sign != 0)
return 0;
sign = -1;
++i;
}
if(c == '+'){
if(sign != 0)
return 0;
sign = 1;
++i;
}
break;
}
if(sign == 0)
sign = 1;
size_t s_size {s.size()};
int rtv {};
while((i<s_size) && (isdigit(s[i]))){
if(rtv > INT_MAX/10)
return sign==1 ? INT_MAX : INT_MIN;
if(rtv == INT_MAX/10){
if((sign==1) && (s[i]>='7'))
return INT_MAX;
if((sign==-1) && (s[i]>='8'))
return INT_MIN;
}
rtv = rtv*10 + (s[i]-0x30);
++i;
}
return rtv * sign;
}
};
int myAtoi(char *s){
int sign = 0;
int i = 0;
while(s[i]){
if(s[i] == ' '){
++i;
continue;
}
if(s[i] == '-'){
if(sign != 0)
return 0;
++i;
sign = -1;
}
if(s[i] == '+'){
if(sign != 0)
return 0;
++i;
sign = 1;
}
break;
}
if(sign == 0)
sign = 1;
int rtv = 0;
size_t s_size = strlen(s);
while((i<s_size) && (isdigit(s[i]))){
if(rtv > INT_MAX/10)
return sign==1 ? INT_MAX : INT_MIN;
if(rtv == INT_MAX/10){
if((sign==1) && (s[i]>='7'))
return INT_MAX;
if((sign==-1) && (s[i]>='8'))
return INT_MIN;
}
rtv = rtv*10 + (s[i]-0x30);
++i;
}
return rtv * sign;
}
strtoll
是标准C库中的接口,它的功能完全覆盖了本题需求,唯一需要注意的是,strtoll返回long long型数据,需要写点代码将其转成32的int形式。
int myAtoi(char *s){
long long a = strtoll(s, NULL, 10);
if(a >= 0)
return (a&0xFFFFFFFF80000000) ? 0x7FFFFFFF : (int)a;
return a<=-2147483648 ? 0x80000000 : (int)a;
}
strtol
的可移植性存在点问题,long型在windows平台下是32位的,所以用strtoll
。strto*
接口,如果有,这道题就无需做了。class Solution {
public:
int myAtoi(string s) {
int a;
try{
a = stoi(s);
}catch(const std::out_of_range&){
string whitespaces{"\t\n\r\f\v "};
s.erase(0, s.find_first_not_of(whitespaces));
return s[0]!='-' ? 0x7FFFFFFF : 0x80000000;
}catch(const std::invalid_argument&){
return 0;
}
return a;
}
};
我首先怀疑的是C++的异常导致的慢,先学了一点理论和网络上关于C++异常性能不佳的观点。后来,我自己用google benchmark作了测试,测试代码和结果如下。
测试代码:
#include <iostream>
#include <string>
#include <regex>
#include <benchmark/benchmark.h>
using namespace std;
class Solution {
public:
int myAtoi(string s) {
regex re {R"(\s*([-+]?\d*))"};
cmatch cm;
regex_search(s.c_str(), cm, re);
if((cm[1].length() == 0) ||
(cm[1] == '-') ||
(cm[1] == '+'))
return 0;
int sign {string(cm[1])[0]=='-'?-1:1};
int rtv{};
try{
rtv = stoi(cm[1]);
}catch(const out_of_range &e){
return sign==1 ? 0x7FFFFFFF : 0x80000000;
}
return rtv;
}
};
static void BM_test_myAtoi(benchmark::State &state){
Solution slt;
string a{" -123456abcdefg"};
for(auto _: state)
slt.myAtoi(a);
}
static void BM_test_myAtoi_exc(benchmark::State &state){
Solution slt;
string a{" -123456999999999abcdefg"};
for(auto _: state)
slt.myAtoi(a);
}
BENCHMARK(BM_test_myAtoi);
BENCHMARK(BM_test_myAtoi_exc);
class Solution2 {
public:
int myAtoi(string s) {
regex re {R"(\s*([-+]?\d*))"};
cmatch cm;
regex_search(s.c_str(), cm, re);
if((cm[1].length() == 0) ||
(cm[1] == '-') ||
(cm[1] == '+'))
return 0;
int sign {string(cm[1])[0]=='-'?-1:1};
int rtv{};
rtv = stoi(cm[1]);
return rtv;
}
};
static void BM_test_myAtoi_no_exc(benchmark::State &state){
Solution slt;
string a{" -123456abcdefg"};
for(auto _: state)
slt.myAtoi(a);
}
BENCHMARK(BM_test_myAtoi_no_exc);
BENCHMARK_MAIN();
测试结果:
2022-12-17T17:00:30+08:00
Running ./a.out
Run on (4 X 3100 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x2)
L1 Instruction 32 KiB (x2)
L2 Unified 256 KiB (x2)
L3 Unified 3072 KiB (x1)
Load Average: 0.21, 0.46, 0.61
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_test_myAtoi 117608 ns 117230 ns 5788
BM_test_myAtoi_exc 122375 ns 121955 ns 5622
BM_test_myAtoi_no_exc 117365 ns 117012 ns 5918
不知道怎么关闭cpu scaling?
反复测试了多次,结果都一致,异常处理只慢5微秒左右(1毫秒=1000微秒),好像没啥!?而代码中是否有异常处理,在不会触发异常的时候,性能差不多(那一点点的差异在纳秒级)。
继续查看资料,我倒是接受了一个观点:正则表达式是通用算法,它基本上比专用算法要慢,而且占用资源也多!不要在对性能要求很高的场景下用正则表达式。
那最后,就只有一个原因了,Python的RE比C++的要快!用Python的timeit在同一台电脑上测试,Python的那个实现,的确要比C++的实现,快十几倍。(代码略,Python的re模块地层连接的是C代码)
难道C++的正则表达式真的很慢?
本文链接:https://cs.pynote.net/ag/leetcode/202212112/
-- EOF --
-- MORE --