math_utils.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # -*- coding: utf-8 -*-
  2. """
  3. @author:XuMing(xuming624@qq.com)
  4. @description:
  5. """
  6. def edit_distance_word(word, char_set):
  7. """
  8. all edits that are one edit away from 'word'
  9. :param word:
  10. :param char_set:
  11. :return:
  12. """
  13. splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
  14. transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
  15. replaces = [L + c + R[1:] for L, R in splits if R for c in char_set]
  16. return set(transposes + replaces)
  17. def get_sub_array(nums):
  18. """
  19. 取所有连续子串,
  20. [0, 1, 2, 5, 7, 8]
  21. => [[0, 3], 5, [7, 9]]
  22. :param nums: sorted(list)
  23. :return:
  24. """
  25. ret = []
  26. ii = 0
  27. for i, c in enumerate(nums):
  28. if i == 0:
  29. pass
  30. elif i <= ii:
  31. continue
  32. elif i == len(nums) - 1:
  33. ret.append([c])
  34. break
  35. ii = i
  36. cc = c
  37. # get continuity Substring
  38. while ii < len(nums) - 1 and nums[ii + 1] == cc + 1:
  39. ii = ii + 1
  40. cc = cc + 1
  41. if ii > i:
  42. ret.append([c, nums[ii] + 1])
  43. else:
  44. ret.append([c])
  45. return ret
  46. def find_all_idx2(lst, item):
  47. """
  48. 取列表中指定元素的所有下标
  49. :param lst: 列表或字符串
  50. :param item: 指定元素
  51. :return: 下标列表
  52. """
  53. ids = []
  54. for i in range(len(lst)):
  55. if item == lst[i]:
  56. ids.append(i)
  57. return ids
  58. def find_all_idx(lst, item):
  59. """
  60. 取列表中指定元素的所有下标
  61. :param lst: 列表或字符串
  62. :param item: 指定元素
  63. :return: 下标列表
  64. """
  65. ids = []
  66. pos = -1
  67. for i in range(lst.count(item)):
  68. pos = lst.index(item, pos + 1)
  69. if pos > -1:
  70. ids.append(pos)
  71. return ids
  72. def edit_distance_dp(str1: str, str2: str) -> int:
  73. """
  74. 计算两个字符串的编辑距离
  75. Args:
  76. str1:
  77. str2:
  78. Returns:
  79. int: 编辑距离
  80. """
  81. if not str1:
  82. return len(str2)
  83. if not str2:
  84. return len(str1)
  85. dp = [[0 for _ in range(len(str2) + 1)] for _ in range(len(str1) + 1)]
  86. for i in range(0, len(str1) + 1):
  87. dp[i][0] = i
  88. for j in range(0, len(str2) + 1):
  89. dp[0][j] = j
  90. for i in range(1, len(str1) + 1):
  91. for j in range(1, len(str2) + 1):
  92. if str1[i - 1] == str2[j - 1]:
  93. dp[i][j] = dp[i - 1][j - 1]
  94. else:
  95. dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
  96. return dp[-1][-1]
  97. def edit_distance(str1, str2):
  98. try:
  99. # very fast
  100. # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
  101. import Levenshtein
  102. d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
  103. except:
  104. # https://docs.python.org/2/library/difflib.html
  105. import difflib
  106. d = 1.0 - difflib.SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
  107. return d
  108. if __name__ == "__main__":
  109. l = [1, 2, 3, 4, 2, 3, 4]
  110. item = 2
  111. print(find_all_idx(l, item))
  112. l = '我爱中国,我是中国人'
  113. item = '中国'
  114. print(find_all_idx(l, item))