[labuladong/fucking-algorithm]滑动窗口算法延伸:Rabin Karp 字符匹配算法 :: labuladong的算法小抄

2023-12-11 959 views

回答

6

纯纯滑动窗口思路Java代码如下:

  public static List<String> findRepeatedDnaSequences(String s) {
       int L = 10;
       Set<String> res = new HashSet();
       List<String> Fres = new ArrayList<>();
       HashSet<String> seen = new HashSet<>();

       List<Character> window = new ArrayList<>();
       int left = 0;int right = 0;
       while (right < s.length()) {
           window.add(s.charAt(right));
           right++;
           if (right - left == L) {
               String windowStr = "";

               for (int i = 0 ; i < window.size();i++) {
                   windowStr += window.get(i);
               }

               if (seen.contains(windowStr)) {
//                   System.out.println("找到一个重复子串: "+ windowStr);
                   res.add(windowStr);
               }else {
                   seen.add(windowStr);
               }
               window.remove(0);
               left++;
           }
       }
       Fres.addAll(res);
    return Fres;
    }
0
func findRepeatedDnaSequences(s string) []string {
    arr := make([]byte, len(s))
    for i, v := range s {
        switch string(v) {
        case "A":
            arr[i] = 0
        case "C":
            arr[i] = 1
        case "G":
            arr[i] = 2
        case "T":
            arr[i] = 3
        }
    }

    left, right := 0, 0
    var res []string
    resMap := map[string]struct{}{}
    seek := map[int]bool{}

    tmp := 0
    for right < len(arr) {
        tmp = addNewStr(tmp, arr[right])
        right++

        for right-left >= 10 {
            if _, exist := seek[tmp]; exist {
                resMap[s[left:right]] = struct{}{}
            } else {
                seek[tmp] = true
            }
            left++
        }
    }

    for key := range resMap {
        res = append(res, key)
    }

    return res
}

func addNewStr(num int, b byte) int {
    num = num << 2
    num = num & 0xfffff
    num = num | int(b)
    return num
}

go语言解决重复的DNA序列问题,感谢大佬提供的思路

9

取了个巧,刚好这个问题只需要求重复的10个序列,而总共只有4种碱基,利用2个bit位刚好能够表示,总共只需要20个bit位,也就是int32其实就够了

4

我看官方题解下面也有人在讨论DNA这个题,如本书作者所说(下图1),O(L)的时间复杂度来自于生成子串,可是加入了hash后还是存在生成子串的过程吧(下图2)?所以时间复杂度不是应该还是要考虑L? image image

8

TypeScript

function findRepeatedDnaSequences(s: string): string[] {
  const ret: Set<string> = new Set();

  const n = s.length;

  const map = {
    'A': 0,
    'G': 1,
    'C': 2,
    'T': 3,
  };

  const nums: number[] = new Array(n);

  // 将字符串转数字
  for(let i = 0; i < n; i++) {
    nums[i] = map[s[i]];
  };

  // 数字长度
  const L = 10;
  // 4 进制
  const R = 4;
  const RL = Math.pow(R, L - 1);

  // 左右窗口边界索引
  let left = 0;
  let right = 0;

  // 记录重复出现的 hash 值
  const seenHash = {};

  // 窗口内 hash
  let windowHash = 0;

  while (right < s.length) {
    windowHash = windowHash * R + nums[right];

    right++;

    while (right - left === L) {
      if (seenHash[windowHash]) {
        ret.add(s.substring(left, right));
      } else {
        seenHash[windowHash] = true;
      }

      windowHash = windowHash - nums[left] * RL;

      left++;
    }
  }

  return Array.from(ret);
};
5

滴滴滴打卡

6

第一章早就刷完了怎么还没小旗子,原来更新了

4

推导一下公式

hashBase = 1658598167
needleHashBase = hashStep ^ (needle.length - 1)
hash = hash - charToRemove * needleHashBase
     => (hash - charToRemove * needleHashBase) % hashBase      // this is not equivalent, will cause hash collision
     = ((hash - charToRemove * needleHashBase) + hashBase) % hashBase      // (a) % Q = (a + Q) % Q, in case for negative when removing a char
     = ((hash % hashBase - charToRemove * needleHashBase % hashBase) + hashBase) % hashBase     // (a + b) % Q = (a % Q + b % Q) % Q, mod each one to prevent overflow
     = ((hash - charToRemove * needleHashBase % hashBase) + hashBase) % hashBase     // hash was alread moded by hashBase in the previous step, so hash % hashBase = hash
7

-1%3 =2 最后rabin-karp算法中的左移处理时,不加Q也是可以的

7

打卡 为了避免整型溢出,选一个大素数Q,取模。

4

windowHash = ((R windowHash) % Q + txt.charAt(right)) % Q 为什么不是 windowHash = (R windowHash+ txt.charAt(right)) % Q 或者 windowHash = ((R * windowHash) % Q + txt.charAt(right) % Q ) % Q 呢