2018 KAKAO BLIND RECRUITMENT - 뉴스 클러스터링 (Level 2)

 

https://programmers.co.kr/learn/courses/30/lessons/17677


Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
from collections import Counter

def solution(str1, str2):
    # 1. Lower
    str1, str2 = str1.lower(), str2.lower()


    # 2. Generate substring list
    def generate_valid_substrings(str):
        # Select only [a-z] substrings
        sstr = [str[i:i+2] for i in range(len(str)-1)]
        return [ss for ss in sstr if re.match("^[a-z]+$", ss)]
    sstr1 = generate_valid_substrings(str1)
    sstr2 = generate_valid_substrings(str2)


    # 3. Convert to set
    union  = set(sstr1) | set(sstr2)
    inters = set(sstr1) & set(sstr2)

    # 3.1 Convert to multiset
    cnt1, cnt2   = Counter(sstr1), Counter(sstr2)
    union_multi  = {k: max(cnt1[k], cnt2[k]) for k in union}
    inters_multi = {k: min(cnt1[k], cnt2[k]) for k in inters}


    # 4. Compute Jaccard distance
    dist = sum(inters_multi.values()) / sum(union_multi.values()) if cnt1 or cnt2 else 1
    return int(dist * 65536)

Complexity

$O(|\text{str1}|+|\text{str2}|)$