4. Median of Two Sorted Arrays,两个有序数组的中值

Last Updated: 2024-01-10 13:18:26 Wednesday

-- TOC --

我的第1道LeetCode HARD级别挑战,想快,的确很难!

题目分析

Given two sorted arrays nums1 and nums2 of size m and n respectively, return the median of the two sorted arrays. The overall run time complexity should be \(O(log(m+n))\).

输入是两个已排序的array,已知他们的size,输出这两个array的中间数median,float类型。要求Big-Oh为\(O(\log(m+n))\)。概率和统计中对median的定义很清晰,如果元素个数是奇数,median就是中间那个数,如果是偶数,就是中间两个数的算数平均值。

两个示例:

Input: nums1 = [1,3], nums2 = [2]
Output: 2.00000
Explanation: merged array = [1,2,3] and median is 2.

Input: nums1 = [1,2], nums2 = [3,4]
Output: 2.50000
Explanation: merged array = [1,2,3,4] and median is (2 + 3) / 2 = 2.5.

Constraints:

Brute Force

将两个已排序array合并成一个array,然后找中间的数,这样做显然比较简单,但性能较差。玩一玩...

Python

class Solution:
    def findMedianSortedArrays(self, nums1: List[int], nums2: List[int]) -> float:
        a = sorted(nums1+nums2)
        alen = len(a)
        m = alen // 2
        return a[m]/1 if alen%2 else (a[m-1]+a[m])/2 

C++

class Solution {
public:
    double findMedianSortedArrays(vector<int>& nums1, vector<int>& nums2) {
        size_t n1 {nums1.size()};
        size_t n2 {nums2.size()};
        size_t n {n1+n2};
        vector<int> nums(n);
        merge(nums1.begin(), nums1.end(),
              nums2.begin(), nums2.end(),
              nums.begin());
        return n&1 ? (double)nums[n/2] : \
                     ((double)(nums[n/2-1]+nums[n/2]))/2;
    }
};

C

static inline int comp(const void *a, const void *b){
    return *(int*)a<*(int*)b ? -1 : *(int*)a>*(int*)b ? 1 : 0; 
}

double findMedianSortedArrays(int* nums1, int nums1Size, int* nums2, int nums2Size){
    int n = nums1Size + nums2Size;
    int *p = malloc(sizeof(int)*n);
    memcpy(p, nums1, sizeof(int)*nums1Size);
    memcpy(p+nums1Size, nums2, sizeof(int)*nums2Size);
    qsort(p, n, sizeof(int), comp);
    double r = n&1 ? (double)p[n/2] : ((double)(p[n/2]+p[n/2-1]))/2;
    free(p);
    return r;
}

不如再试一试,自己写一个像C++一样的merge接口:

static void merge_two_sorted(int *d, int *n1, int n1size, int *n2, int n2size){
    int i1,i2,j;
    i1 = i2 = j = 0;
    while(i1<n1size && i2<n2size)
        if(n1[i1] <= n2[i2])
            d[j++] = n1[i1++];
        else
            d[j++] = n2[i2++];
    if(i1 == n1size)
        memcpy(d+j, n2+i2, (n2size-i2)*sizeof(int));
    else
        memcpy(d+j, n1+i1, (n1size-i1)*sizeof(int));
}

double findMedianSortedArrays(int* nums1, int nums1Size, int* nums2, int nums2Size){
    int n = nums1Size + nums2Size;
    int *p = malloc(sizeof(int)*n);
    merge_two_sorted(p, nums1, nums1Size, nums2, nums2Size);
    double r = n&1 ? (double)p[n/2] : ((double)(p[n/2]+p[n/2-1]))/2;
    free(p);
    return r;
}

双指针推进

Brute Force调用了编程语言标准库中的排序算法,虽然不算很慢,但需要消耗\(O(m+n)\)的内存。其实,我们不需要用那么多内存来排序,只需要找到中间的位置,将需要的中间的数拿出来计算返回即可。由于两个输入队列已经排好序,因此用两个指针来推进到中间的位置,是一个还算自然的思路。

Python

class Solution:
    def findMedianSortedArrays(self, nums1: List[int], nums2: List[int]) -> float:
        n1 = len(nums1)
        n2 = len(nums2)
        if n1 == 0:
            p = n2 // 2
            return nums2[p]/1 if n2&1 else (nums2[p]+nums2[p-1])/2
        if n2 == 0:
            p = n1 // 2
            return nums1[p]/1 if n1&1 else (nums1[p]+nums1[p-1])/2
        n = n1 + n2
        m = n // 2
        p1 = p2 = k = 0
        while m:
            m -= 1
            if nums1[p1] < nums2[p2]:
                k = nums1[p1]
                p1 += 1
                if p1 == n1:
                    return nums2[p2+m]/1 if n%2 else \
                               (nums2[p2+m-1]+nums2[p2+m])/2 if m else \
                                   (nums2[p2]+k)/2
            else:
                k = nums2[p2]
                p2 += 1
                if p2 == n2:
                    return nums1[p1+m]/1 if n%2 else \
                               (nums1[p1+m-1]+nums1[p1+m])/2 if m else \
                                   (nums1[p1]+k)/2
        mn = min(nums1[p1], nums2[p2])
        return mn/1 if n%2 else (mn+k)/2

k的作用很关键,它记录了每一次推进前的那个最小的数。这个解法,已经满足了\(O(m+n)\)的要求。

C++

class Solution {
public:
    double findMedianSortedArrays(vector<int>& nums1, vector<int>& nums2) {
        size_t n1 {nums1.size()};
        size_t n2 {nums2.size()};
        if(n1 == 0){
            size_t p {n2/2};
            return n2&1 ? (double)nums2[p] : (double)(nums2[p-1]+nums2[p])/2;
        }
        if(n2 == 0){
            size_t p {n1/2};
            return n1&1 ? (double)nums1[p] : (double)(nums1[p-1]+nums1[p])/2;
        }
        size_t n {n1+n2};
        size_t m {n/2};
        int p1 {}, p2 {}, k {};
        while(m--)
            if(nums1[p1] < nums2[p2]){
                k = nums1[p1++];
                if(p1 == n1)
                    return n&1 ? (double)nums2[p2+m] : \
                             m ? (double)(nums2[p2+m-1]+nums2[p2+m])/2 : \
                                    (double)(nums2[p2]+k)/2;
            } else {
                k = nums2[p2++];
                if(p2 == n2){
                    return n&1 ? (double)nums1[p1+m] : \
                             m ? (double)(nums1[p1+m-1]+nums1[p1+m])/2 : \
                                    (double)(nums1[p1]+k)/2;
                }
            }
        int mn {min(nums1[p1], nums2[p2])};
        return n&1 ? (double)mn : (double)(mn+k)/2;
    }
};

C

double findMedianSortedArrays(int* nums1, int nums1Size, int* nums2, int nums2Size){
    if(nums1Size == 0){
        int p = nums2Size / 2;
        return nums2Size&1 ? (double)nums2[p] : (double)(nums2[p-1]+nums2[p])/2; 
    }
    if(nums2Size == 0){
        int p = nums1Size / 2;
        return nums1Size&1 ? (double)nums1[p] : (double)(nums1[p-1]+nums1[p])/2; 
    }
    int n = nums1Size + nums2Size;
    int m = n / 2;
    int p1, p2, k;
    p1 = p2 = k = 0;
    while(m--)
        if(nums1[p1] < nums2[p2]){
            k = nums1[p1++];
            if(p1 == nums1Size)
                return n&1 ? (double)nums2[p2+m] : \
                         m ? (double)(nums2[p2+m-1]+nums2[p2+m])/2 : \
                                (double)(nums2[p2]+k)/2;
        } else {
            k = nums2[p2++];
            if(p2 == nums2Size){
                return n&1 ? (double)nums1[p1+m] : \
                         m ? (double)(nums1[p1+m-1]+nums1[p1+m])/2 : \
                                (double)(nums1[p1]+k)/2;
            }
        }
    int mn = nums1[p1]<nums2[p2] ?  nums1[p1] : nums2[p2];
    return n&1 ? (double)mn : (double)(mn+k)/2;
}

Divide-and-Conquer

在一个有序队列中,查找某个元素的位置,顺序遍历查找(Sequential Search)是\(O(n)\),而二分查找(Binary Search)是\(O(\log n)\)。在单个有序队列上作二分查找很容易,现在是两条有序队列。不过此题的要求是找到中间的1个或2个数,位置赢确定,也就跟bsearch没啥关系了。

此题应用Divide-and-Conquer的关键(我看答案了,这个思路我自己没想出来),是要想清楚,已知两个序列的长度,也就知道了需要寻找的那1个或2个数在混合排序后的index。双指针推进一次只能+1,实际上每次可以推进更多!

在两条队列中,找一个知道混合排序后index的数!大于两条队列呢?用heap数据结构配合单步推进!LeetCode第23题,合并k个有序单链表

Python

class Solution:
    def findMedianSortedArrays(self, nums1: List[int], nums2: List[int]) -> float:
        n1 = len(nums1)
        n2 = len(nums2)
        if n1 == 0:
            p = n2 // 2
            return nums2[p]/1 if n2%2 else (nums2[p]+nums2[p-1])/2
        if n2 == 0:
            p = n1 // 2
            return nums1[p]/1 if n1%2 else (nums1[p]+nums1[p-1])/2
        n = n1 + n2
        x = (n-1) // 2
        p11 = p12 = p21 = p22 = 0
        while x:
            step = (x-1) // 2
            p12 = min(p11+step, n1-1)
            p22 = min(p21+step, n2-1)
            if nums1[p12] < nums2[p22]:
                x -= p12 - p11 + 1
                p11 = p12 + 1
                if p11 == n1:
                    return nums2[p21+x]/1 if n%2 else \
                               (nums2[p21+x]+nums2[p21+x+1])/2
            else:
                x -= p22 - p21 + 1
                p21 = p22 + 1
                if p21 == n2:
                    return nums1[p11+x]/1 if n%2 else \
                               (nums1[p11+x]+nums1[p11+x+1])/2
        if n%2:
            return min(nums1[p11],nums2[p21])/1
        else:
            if nums1[p11] < nums2[p21]:
                return (nums1[p11] +
                        (nums2[p21] if p11==n1-1 else 
                            min(nums1[p11+1],nums2[p21])))/2
            else:
                return (nums2[p21] +
                        (nums1[p11] if p21==n2-1 else 
                            min(nums2[p21+1],nums1[p11])))/2

step是性能关键!x是整体要排除的数的个数,step是x的一半还少,因为有两条已排序队列。每次计算出来的step值,都是可以在某一条队列中安全排除掉的元素个数,但就是不知道它们在哪条队列中。

Pushing forward more than 1 step in each loop is definitely better than only 1 step. But the order of growth is the same, with a smaller hidden constant.

C++

class Solution {
public:
    double findMedianSortedArrays(vector<int>& nums1, vector<int>& nums2) {
        size_t n1 {nums1.size()};
        size_t n2 {nums2.size()};
        if(n1 == 0){
            size_t p {n2/2};
            return n2&1 ? (double)nums2[p] : (double)(nums2[p-1]+nums2[p])/2;
        }
        if(n2 == 0){
            size_t p {n1/2};
            return n1&1 ? (double)nums1[p] : (double)(nums1[p-1]+nums1[p])/2;
        }
        size_t n {n1+n2};
        size_t x {(n-1)/2};
        size_t p11{}, p12{}, p21{}, p22{};
        while(x){
            size_t step {(x-1)/2};
            p12 = min(p11+step, n1-1);
            p22 = min(p21+step, n2-1);
            if(nums1[p12] < nums2[p22]){
                x -= p12 - p11 + 1;
                p11 = p12 + 1;
                if(p11 == n1)
                    return n&1 ? (double)nums2[p21+x] : \
                                        (double)(nums2[p21+x]+nums2[p21+x+1])/2;
            } else {
                x -= p22 - p21 + 1;
                p21 = p22 + 1;
                if(p21 == n2)
                    return n&1 ? (double)nums1[p11+x] : \
                                        (double)(nums1[p11+x]+nums1[p11+x+1])/2;
            }
        }
        if(n & 1)
            return (double)min(nums1[p11], nums2[p21]);
        else{
            if(nums1[p11] < nums2[p21])
                return (double)(nums1[p11] + \
                            (p11==n1-1 ? nums2[p21] : min(nums1[p11+1],nums2[p21])))/2;
            else
                return (double)(nums2[p21] + \
                            (p21==n2-1 ? nums1[p11] : min(nums2[p21+1],nums1[p11])))/2;
        }
    }
};

C

#define MIN(a,b)  ((a)<(b)?(a):(b))

double findMedianSortedArrays(int* nums1, int nums1Size, int* nums2, int nums2Size){
    if(nums1Size == 0){
        int p = nums2Size / 2;
        return nums2Size&1 ? (double)nums2[p] : (double)(nums2[p-1]+nums2[p])/2; 
    }
    if(nums2Size == 0){
        int p = nums1Size / 2;
        return nums1Size&1 ? (double)nums1[p] : (double)(nums1[p-1]+nums1[p])/2; 
    }
    int n = nums1Size + nums2Size;
    int x = (n-1) / 2;
    int p11, p12, p21, p22;
    p11 = p12 = p21 = p22 = 0; 
    while(x){
        int step = (x-1)/2;
        p12 = MIN(p11+step, nums1Size-1);
        p22 = MIN(p21+step, nums2Size-1);
        if(nums1[p12] < nums2[p22]){
            x -= p12 - p11 + 1;
            p11 = p12 + 1;
            if(p11 == nums1Size)
                return n&1 ? (double)nums2[p21+x] : \
                                    (double)(nums2[p21+x]+nums2[p21+x+1])/2;
        } else {
            x -= p22 - p21 + 1;
            p21 = p22 + 1;
            if(p21 == nums2Size){
                return n&1 ? (double)nums1[p11+x] : \
                                    (double)(nums1[p11+x]+nums1[p11+x+1])/2;
            }
        }
    }
    if(n & 1)
        return (double)MIN(nums1[p11], nums2[p21]);
    else{
        if(nums1[p11] < nums2[p21])
            return (double)(nums1[p11] + \
                        (p11==nums1Size-1?nums2[p21]:MIN(nums1[p11+1],nums2[p21])))/2;
        else
            return (double)(nums2[p21] + \
                        (p21==nums2Size-1?nums1[p11]:MIN(nums2[p21+1],nums1[p11])))/2;
    }
}

对Python性能的思考

Python非常适合用来写算法,实现prototype,验证可行性。但用Python写出来的算法,其性能不一定比在Python中用笨办法的实现要快。这是因为笨办法执行的statements数量少,重要的操作底层可能是C的速度。而好办法执行的statements数量可能很多,这一多,可能就反而不如有C的速度加持的笨办法了。

因此,Python用来实现和验证算法,速度测量需要使用C/C++来做。其实,Python实现后,再用C++实现就比较简单了,但如果用C来实现,会相对困难。

本文链接:https://cs.pynote.net/ag/leetcode/202211151/

-- EOF --

-- MORE --