Last Updated: 2024-06-14 22:49:28 Friday
-- TOC --
排在LeetCode最前面的,第1题,Two Sum,寻找两个数的和等于给定的Target,难度等级Easy。但是,这个既是Easy同时还是第1道热身的题,也有非常不简单的地方。
Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. You may assume that each input would have exactly one solution, and you may not use the same element twice. You can return the answer in any order.
输入一个int array,一个target,在此array中寻找两个数,使这两个数的和等于target。每组输入都有且只有一个解。对于数组中的每个index对应的元素,只能使用一次(如果找到的是两个相等的数,则这两个数的index必不相同)。返回的index可以任意顺序。
Constraints:
- 2 <= nums.length <= 10^4
- -10^9 <= nums[i] <= 10^9(这个范围确定了两个int相加不会溢出)
- -10^9 <= target <= 10^9
- Only one valid answer exists
寻找两个数的和,最简单粗暴的方式,就是两重循环,通过尝试所有的两两组合,来确定这个必定存在的解。由于只有一个解,找到即可返回。
C
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
int i,j;
int *rp = NULL;
*returnSize = 0;
for (i=0; i<numsSize; ++i)
for (j=i+1; j<numsSize; ++j)
if ((nums[i]+nums[j]) == target) {
rp = (int*)malloc(sizeof(int)*2);
if (rp == NULL)
return rp;
rp[0] = i;
rp[1] = j;
*returnSize = 2;
return rp;
}
return rp;
}
应该在此接口最开始的地方malloc,如果找到了答案,但malloc失败,就白忙豁了...如果caller不做free,这里用一小块static memory最好。
Python
class Solution:
def twoSum(self, nums: List[int], target: int) -> List[int]:
size = len(nums)
for i in range(size):
for j in range(i+1,size):
if nums[i]+nums[j] == target:
return [i,j]
return [-1,-1] # might be used to indicate error
C++
class Solution {
public:
vector<int> twoSum(vector<int>& nums, int target) {
vector rtv{-1,-1}; // might be used to indicate error
for (int i=0; i<nums.size(); ++i)
for (int j=i+1; j<nums.size(); ++j)
if (nums[i]+nums[j] == target) {
rtv[0] = i;
rtv[1] = j;
return rtv;
}
return rtv;
}
};
使用Hash Table查找,在没有冲突的情况下是\(O(1)\)。当我们固定一个数\(a\),\(target - a = b\),对于某个a,如果b存在(index不同),这就是解。先建立Hash Table,然后在一个循环中查询Hash Table,查询是\(O(1)\)。建立Hash Table需要的时间和空间,与n是线性关系。
Python
Python自带的dict类型,就是非常好用的hash table,所以,用Python来写这段代码,比较简单轻松。
class Solution:
def twoSum(self, nums: List[int], target: int) -> List[int]:
# create hash table,
# the index with same value will be replaced.
hash_tab = {v:i for i,v in enumerate(nums)}
for i,v in enumerate(nums):
a = target - v
if a in hash_tab and hash_tab[a]!=i:
return [i,hash_tab[a]]
return [-1,-1]
无需顾虑哈希表中的冲突,nums中相同的值在hash_tab中的index值会被最大的index覆盖,根据题目,that's fine,要么这两个相同的元素就是解,在第2次循环时,排在前面的数会找到排在后面的数。要么这两个相同的元素都不是解。
C
下面是C语言版的解法,代码就比较复杂了,但是执行速度真心快。
typedef struct hash_node {
int val;
int idx;
struct hash_node *next;
} hash_node;
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
// prepare return value
*returnSize = 0;
int *prtv = malloc(sizeof(int*) * 2);
if (prtv == NULL)
return NULL; // has to return NULL
prtv[0] = prtv[1] = -1; // might be used to indicate error
// create hash table
hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
if (phash == NULL)
return prtv;
memset(phash, 0, sizeof(char*)*numsSize);
// pass one: create hash table nodes
int key;
hash_node *pnode, *ptmp;
for (int i=0; i<numsSize; ++i) {
key = abs(nums[i]) % numsSize; // value to key
pnode = (hash_node*)malloc(sizeof(hash_node));
if (pnode == NULL)
goto end;
pnode->val = nums[i];
pnode->idx = i;
pnode->next = NULL;
if (phash[key] == NULL)
phash[key] = pnode;
else {
ptmp = phash[key];
while (ptmp->next)
ptmp = ptmp->next;
ptmp->next = pnode;
}
}
// pass two: search
int v;
for (int i=0; i<numsSize; ++i) {
v = target-nums[i];
key = abs(v) % numsSize;
if (phash[key] != NULL) {
ptmp = phash[key];
do {
if ((ptmp->val==v) && (ptmp->idx!=i)) {
prtv[0] = i;
prtv[1] = ptmp->idx;
*returnSize = 2;
goto end; // get out of the nested loop
}
} while (ptmp=ptmp->next);
}
}
end:; // free and return
hash_node *prev;
for (int i=0; i<numsSize; ++i) {
if (phash[i] != NULL) {
prev = phash[i];
do {
ptmp = prev->next;
free(prev);
} while (prev=ptmp);
}
}
free(phash);
return prtv;
}
这个C版本,建立hash表,从value到key的计算方式为:key = abs(nums[i]) % numsSize
。因此,冲突是很可能存在的!这个版本的内存占用比较高!这个实现考虑了冲突,其实是可以不用考虑的,不想改了...后面有uthash版本。
C++
class Solution {
public:
vector<int> twoSum(vector<int>& nums, int target) {
int i,v;
vector rtv {-1,-1};
unordered_map<int,int> udict;
udict.reserve(nums.size());
// create hash table,
// only save the last index for same value.
for (i=0; i<nums.size(); ++i)
udict.emplace(nums[i], i);
// search
for (i=0; i<nums.size(); ++i) {
v = target - nums[i];
auto t {udict.find(v)};
if (t!=udict.end() && (udict[v]!=i)) {
rtv[0] = i;
rtv[1] = udict[v];
return rtv;
}
}
return rtv;
}
};
One-pass的思路是,在建立Hash Table的同时,其实就可以做查询了,一边建立一边查询,这就是one pass,只遍历一遍数据。解的两个数,一前一后,当循环到后面的那个数的时候,前面的数一定已经在hash表中存在了。而且,这个思路还彻底规避了找到相同index的风险。
Python
class Solution:
def twoSum(self, nums: List[int], target: int) -> List[int]:
hash_tab = {}
for i,v in enumerate(nums):
# search first
a = target - v
if a in hash_tab:
return [i,hash_tab[a]]
# add to hash table
hash_tab[v] = i
return [-1,-1]
C++
class Solution {
public:
vector<int> twoSum(vector<int>& nums, int target) {
vector rtv {-1,-1};
int len { static_cast<int>(nums.size()) };
unordered_map<int,int> ump;
for(int i{}; i<len; ++i){
int a = target - nums[i];
if(ump.contains(a)){
rtv[0] = i;
rtv[1] = ump[a];
return rtv;
}
ump[nums[i]] = i;
}
return rtv;
}
};
C
typedef struct hash_node {
int val;
int idx;
struct hash_node *next;
} hash_node;
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
// prepare return value
*returnSize = 0;
int *prtv = (int*)malloc(sizeof(int*) * 2);
if (prtv == NULL)
return NULL; // has to return NULL
prtv[0] = prtv[1] = -1; // might be used to indicate error
// create hash table
hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
if (phash == NULL)
return prtv;
memset(phash, 0, sizeof(char*)*numsSize);
// one pass
int key,v;
hash_node *pnode, *ptmp;
for (int i=0; i<numsSize; ++i) {
// search first
v = target - nums[i];
key = abs(v) % numsSize;
if (phash[key] != NULL) {
ptmp = phash[key];
do {
if (ptmp->val==v) {
prtv[0] = i;
prtv[1] = ptmp->idx;
*returnSize = 2;
goto end; // get out of the nested loop
}
} while (ptmp=ptmp->next);
}
// add to hash table
key = abs(nums[i]) % numsSize;
pnode = (hash_node*)malloc(sizeof(hash_node));
if (pnode == NULL)
goto end;
pnode->val = nums[i];
pnode->idx = i;
pnode->next = NULL;
if (phash[key] == NULL)
phash[key] = pnode;
else {
ptmp = phash[key];
while (ptmp->next)
ptmp = ptmp->next;
ptmp->next = pnode;
}
}
end:; // free and return
hash_node *prev;
for (int i=0; i<numsSize; ++i) {
if (phash[i] != NULL) {
prev = phash[i];
do {
ptmp = prev->next;
free(prev);
} while (prev=ptmp);
}
}
free(phash);
return prtv;
}
C语言的实现,还有个可以优化的细节,即使用memory pool。建立Hash Table的时候,不要一次次的malloc(对应一次次的free),而是事先准备一个足够大小的memory pool,每次从pool中取一块来使用。虽然会造成memory的使用增大,但,从来都是内存换速度呀.....
此题的约束条件中说明,int array的最大长度是10^4
。使用下面这个struct hash_node
:
typedef struct hash_node {
int val;
int idx;
struct hash_node *next;
} hash_node;
最大需要的内存为:156.25KB
,不算特别多,值得一试!代码如下:
typedef struct hash_node {
int val;
int idx;
struct hash_node *next;
} hash_node;
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
// prepare return value
*returnSize = 0;
int *prtv = (int*)malloc(sizeof(int*) * 2);
if(!prtv)
return NULL; // has to return NULL
prtv[0] = prtv[1] = -1; // might be used to indicate error
// create hash table
hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
if (phash == NULL)
return prtv;
memset(phash, 0, sizeof(char*)*numsSize);
// malloc memory pool
hash_node *ht_mem = (hash_node*)malloc(sizeof(hash_node)*numsSize);
if(!ht_mem){
free(phash);
return NULL;
}
int ht_idx = 0;
// one pass
int key,v;
hash_node *pnode, *ptmp;
for (int i=0; i<numsSize; ++i) {
// search first
v = target - nums[i];
key = abs(v) % numsSize;
if (phash[key] != NULL) {
ptmp = phash[key];
do {
if (ptmp->val==v) {
prtv[0] = i;
prtv[1] = ptmp->idx;
*returnSize = 2;
goto end; // get out of the nested loop
}
} while (ptmp=ptmp->next);
}
// add to hash table
key = abs(nums[i]) % numsSize;
// pnode = (hash_node*)malloc(sizeof(hash_node));
pnode = (hash_node*)&ht_mem[ht_idx++];
pnode->val = nums[i];
pnode->idx = i;
pnode->next = NULL;
if (phash[key] == NULL)
phash[key] = pnode;
else {
ptmp = phash[key];
while (ptmp->next)
ptmp = ptmp->next;
ptmp->next = pnode;
}
}
end:; // free and return
free(phash);
free(ht_mem);
return prtv;
}
代码变简单了一些。
这个算法思路有个专业术语:Two Points Problem
加速搜索的常规手段,前面已经使用了Hash Table,另一个是先排序,对排序后的数据进行搜索,比如著名的二分搜索算法。但此题不能使用二分搜索,因为要寻找的是两个数,以他们的和作为找到的判定条件。我们可以采用一个巧妙的办法,用双向逼近的方法。这个方法,同样避免了index相同的场景。由于存在一次排序,这个算法的Big-Oh将由排序算法主导,各语言内置的排序算法接口,都非常快,但最快也要\(N\log{N}\)。
Python
class Solution:
def twoSum(self, nums: List[int], target: int) -> List[int]:
n = len(nums)
a = [(n,i) for i,n in enumerate(nums)]
a.sort(key=lambda x: x[0])
left = 0
right = n-1
while left < right:
s = a[left][0] + a[right][0]
if s == target:
return [a[left][1], a[right][1]]
if s < target:
left += 1
else:
right -= 1
return [-1,-1]
C
用到了循环展开。
typedef struct node {
int val;
int idx;
} node;
int comp_node(const void *pa, const void *pb){
// 用比较的方式,规避了int计算可能的溢出
int a = ((node*)pa)->val;
int b = ((node*)pb)->val;
return a<b ? -1 : a>b ? 1 : 0;
}
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
*returnSize = 2;
int *prtv = (int*)malloc(sizeof(int*) * 2);
// sort by value
node *pnode = (node*)malloc(sizeof(node) * numsSize);
/*for(int i=0; i<numsSize; ++i){
pnode[i].val = nums[i];
pnode[i].idx = i;
}*/
int i = 0;
int g = numsSize / 16;
while(g--){
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
}
while(i < numsSize){
pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
}
qsort(pnode, numsSize, sizeof(node), comp_node);
// bi-directional search
int left = 0;
int right = numsSize-1;
while(left < right){
int sum = pnode[left].val + pnode[right].val;
if(sum == target){
prtv[0] = pnode[left].idx;
prtv[1] = pnode[right].idx;
free(pnode);
return prtv;
}
if(sum < target)
++left;
else
--right;
}
// never reach here...
free(pnode);
return prtv;
}
C++
用decltype直接得到container的value_type。
class Solution {
public:
vector<int> twoSum(vector<int>& nums, int target) {
vector rtv {-1,-1};
vector<pair<int,size_t>> idxnums;
size_t len { nums.size() };
idxnums.reserve(len);
for(size_t i{0}; i<len; ++i)
idxnums.emplace_back(nums[i],i);
sort(idxnums.begin(),
idxnums.end(),
[](decltype(idxnums)::value_type &a,
decltype(idxnums)::value_type &b){
return a.first < b.first;
});
size_t left {0};
size_t right {len-1};
while(left < right){
int sum { idxnums[left].first+idxnums[right].first };
if(sum == target){
rtv[0] = idxnums[left].second;
rtv[1] = idxnums[right].second;
return rtv;
}
if(sum < target)
++left;
else
--right;
}
// never reach here
return rtv;
}
};
在C语言中构建较为复杂的hash表是比较费劲的,LeetCode提供uthash可以用,想必是uthash用的人多啊。我也来尝试一下,使用one pass算法:
typedef struct node {
int val;
int idx;
UT_hash_handle hh;
} Node;
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
*returnSize = 2;
int *prtv = malloc(sizeof(int*)*2);
Node *pnodes = malloc(sizeof(Node)*numsSize);
int node_idx = 0;
Node *htable = NULL;
Node *ps;
int search;
for(int i=0; i<numsSize; ++i){
ps = NULL;
search = target - nums[i];
HASH_FIND_INT(htable, &search, ps);
if(!ps){
Node *pa = &pnodes[node_idx++];
pa->val = nums[i];
pa->idx = i;
HASH_ADD_INT(htable, val, pa);
}
else{
prtv[0] = ps->idx;
prtv[1] = i;
break;
}
}
HASH_CLEAR(hh, htable);
free(pnodes);
return prtv;
}
这个版本的C代码,看着舒服多了...
class Solution:
def twoSum(self, nums: List[int], target: int) -> List[int]:
n = len(nums)
for i in range(n):
for j in range(i+1,n):
if nums[i]+nums[j] == target:
return [i,j]
if nums[n-i-1]+nums[n-j-1] == target:
return [n-i-1,n-j-1]
return [0,0]
在每一个loop中,干双份工作,同时按两个方向遍历。存在潜在的重复计算问题,但在此问题上,实际效果非常好!非常好...各个语言的版本都是最好战绩,而且它不消耗内存!
这个方法快的原因:只要找到解,就立即return!同时,解出现的位置,大概率在数组两边!
纯粹为了学习在折腾....
C++ with SSE
#include <immintrin.h>
class Solution {
public:
vector<int> twoSum(vector<int>& nums, int target) {
vector rtv {-1,-1};
int len { static_cast<int>(nums.size()) };
unordered_map<int,int> ump;
__m128i t = _mm_set1_epi32(target);
int *data { nums.data() };
int presult[4];
int i{};
for(; i<len/4; ++i){
__m128i a = _mm_loadu_si128((__m128i_u*)(data+i*4));
__m128i b = _mm_sub_epi32(t, a);
_mm_storeu_si128((__m128i_u*)presult, b);
for(int j{}; j<4; ++j){
if(ump.contains(presult[j])){
rtv[0] = i*4+j;
rtv[1] = ump[presult[j]];
return rtv;
}
ump[nums[i*4+j]] = i*4+j;
}
}
for(i=i*4; i<len; ++i){
int a = target - nums[i];
if(ump.contains(a)){
rtv[0] = i;
rtv[1] = ump[a];
return rtv;
}
ump[nums[i]] = i;
}
return rtv;
}
};
C++ with AVX
#include <immintrin.h>
class Solution {
public:
__attribute__((target("avx2")))
vector<int> twoSum(vector<int>& nums, int target) {
vector rtv {-1,-1};
int len { static_cast<int>(nums.size()) };
unordered_map<int,int> ump;
__m256i t = _mm256_set1_epi32(target);
int *data { nums.data() };
int presult[8];
int i{};
for(; i<len/8; ++i){
__m256i a = _mm256_loadu_si256((__m256i_u*)(data+i*8));
__m256i b = _mm256_sub_epi32(t, a);
_mm256_storeu_si256((__m256i_u*)presult, b);
for(int j{}; j<8; ++j){
if(ump.contains(presult[j])){
rtv[0] = i*8+j;
rtv[1] = ump[presult[j]];
return rtv;
}
ump[nums[i*8+j]] = i*8+j;
}
}
for(i=i*8; i<len; ++i){
int a = target - nums[i];
if(ump.contains(a)){
rtv[0] = i;
rtv[1] = ump[a];
return rtv;
}
ump[nums[i]] = i;
}
return rtv;
}
};
__attribute__((target("avx2")))
表示这个函数的编译需要avx2的支持,否则,就需要在命令行使用-mavx2
编译选项。(SSE不用烦的,默认就支持,不需要额外的属性或编译选项)C with AVX
用C就先不麻烦hash table了,直接用brute force配合AVX的32字节并行能力,用并行加法。
#include <immintrin.h>
__attribute__((target("avx2")))
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
*returnSize = 2;
int *prtv = malloc(sizeof(int*)*2);
int presult[8], j;
for(int i=0; i<numsSize; ++i){
if(numsSize-i-1 >= 8){
__m256i a = _mm256_set1_epi32(nums[i]);
for(j=0; j<(numsSize-i-1)/8; ++j){
__m256i b = _mm256_loadu_si256((__m256i_u*)(nums+j*8+i+1));
__m256i c = _mm256_add_epi32(a,b);
_mm256_storeu_si256((__m256i_u*)presult, c);
for(int k=0; k<8; ++k)
if(presult[k] == target){
prtv[0] = i;
prtv[1] = j*8 + i+1 + k;
return prtv;
}
}
} else {
for(j=i+1; j<numsSize; ++j)
if(nums[i]+nums[j] == target){
prtv[0] = i;
prtv[1] = j;
return prtv;
}
}
}
return prtv;
}
纯数字的排序,可以做到线性复杂度,不知是否可能比qsort更快?
本文链接:https://cs.pynote.net/ag/leetcode/202203141/
-- EOF --
-- MORE --