Jovian
⭐️
Sign In
In [4]:
# Download the raw TSV from a host on a private 10.x address and save it as
# data.tsv (the recorded transfer was about 805 MB).
!wget http://10.114.38.22:8888/data.tsv -O data.tsv
--2019-05-23 02:44:20-- http://10.114.38.22:8888/data.tsv Connecting to 10.114.38.22:8888... connected. HTTP request sent, awaiting response... 200 OK Length: 844413808 (805M) [text/tab-separated-values] Saving to: 'data.tsv' data.tsv 100%[===================>] 805.29M 371MB/s in 2.2s 2019-05-23 02:44:22 (371 MB/s) - 'data.tsv' saved [844413808/844413808]
In [189]:
from iwork.pipe import *
from iwork.nlp.utils import Sent2Vec, VecQuery
from gensim.models.fasttext import load_facebook_model
9%|▊ | 869476/10000000 [00:19<00:32, 276794.62it/s]
In [7]:
# Load the fastText model stored at ./skipgram.title and wrap it in Sent2Vec.
# `sv` is the embedder that get_sent_vec calls.
sv = Sent2Vec(load_facebook_model('./skipgram.title'))
In [14]:
def get_sent_vec(s):
    """Return the sentence vector for ``s``.

    ``s`` is coerced to ``str`` first, then passed to ``sv.transform``
    together with ``jieba.cut`` as the second argument.
    """
    text = str(s)
    return sv.transform(text, jieba.cut)
In [59]:
%%time
# Read only the first two columns of data.tsv and name them id / title.
# `sep` is passed by keyword: read_csv's positional separator is deprecated
# and rejected by newer pandas releases.
df_ = pd.read_csv('data.tsv', sep='\t', names=['id', 'title'], usecols=[0, 1])
CPU times: user 16.4 s, sys: 1.2 s, total: 17.6 s Wall time: 17.6 s
In [15]:
# One index key per corpus row, "<id>🕷<title>"; the spider character is the
# separator that the result-parsing cell later splits on.
# Reads df_ (the data.tsv frame): in top-to-bottom order `df` is not assigned
# until the push.tsv cell further down, so `df` here only worked through
# leftover kernel state.
ids = df_['id'] + '🕷' + df_['title']
In [16]:
# Embed every corpus title (same frame and row order as `ids`).
# Reads df_ because `df` is not defined yet in top-to-bottom order. The
# comprehension also keeps the loop variable out of the notebook namespace,
# where the old `s` collided with the `s` set used by the dedup filter.
vectors = [get_sent_vec(title) for title in tqdm(df_.title)]
HBox(children=(IntProgress(value=0, max=4097449), HTML(value='')))
In [18]:
%%time
# Build the vector index from the corpus keys and their embeddings.
# Recorded cost of this cell: about 15 min wall time (9h 47min total CPU).
vq = VecQuery()
vq.createIndex(ids, vectors)
Index Create ... CPU times: user 9h 45min 53s, sys: 2min, total: 9h 47min 53s Wall time: 15min 16s
In [20]:
# Persist the built index. The context manager closes (and flushes) the file
# handle deterministically; the bare open() left that to garbage collection.
with open('./vq.cls.pkl', 'wb') as fh:
    pickle.dump(vq, fh)
In [26]:
# Smoke test: query the index with one embedded phrase and pretty-print the
# hits returned for that single query.
pprint(vq.query([get_sent_vec('周杰伦')])[0])
OrderedDict([('09ffff03b829d8cb0e0cbf326aa3f277🕷周杰伦演唱会为什么失声?请来林俊杰帮忙 ' '林俊杰周杰伦关系大起底', 0.8073062300682068), ('68ade1d70946de042bf3facda3ad91bc🕷周杰伦,只是周杰伦,独此一位', 0.7995769381523132), ('1fb26a550214bd81fd4a71e151ad4d36🕷周杰伦昆凌古天乐林俊杰萧敬腾 明星齐晒娃娃脸', 0.7976314425468445), ('50421af0f3a08e59b78ec9a93ba7ae6e🕷韩庚演唱会,演唱周杰伦歌曲周杰伦也在现场', 0.7896055579185486), ('95c21692be09684ebd90982ae364b331🕷吴宗宪用周杰伦遛粉,发新歌却是十几年前周杰伦写的歌', 0.7842336893081665), ('fceda72df4a5f98f2756b73b31436d72🕷昆凌曝光周杰伦求婚细节,原来周杰伦私底下是这样的“周董”', 0.7834030985832214), ('58d00c6579acbe35c6c0fed4d4501af9🕷昆凌写情书表白周杰伦,杰伦再发图却被催新歌', 0.7822023630142212), ('34fcf8d82fd615e8c78bed83ee36adb7🕷周杰伦采访歌迷始末 周杰伦在哪里采访歌迷的 ' '周杰伦歌迷说了什么?', 0.7787772417068481), ('39bdbbb0b8399bda60606d195a76c59f🕷周杰伦的女儿,已经会唱周杰伦的歌了', 0.7776650190353394), ('b5c962ed8014fbcb3f43a6954fbdd4cc🕷邓紫棋与周杰伦同框 娱乐圈竟然有这么多周杰伦的粉丝', 0.7705827355384827)])
In [66]:
# Load the push candidates and keep only the rows whose crawltime is missing.
# `sep` is passed by keyword (the positional form is rejected by newer pandas),
# and the lambda parameter no longer shadows the outer name `df`.
df = pd.read_csv('./push.tsv', sep='\t')[lambda frame: frame.crawltime.isnull()]
In [68]:
# Drop push titles that already appear verbatim in the corpus.
# Filters against df_.title directly instead of the helper set `s`: in
# top-to-bottom order `s` is assigned only in the next cell and is also reused
# as a loop variable elsewhere, so this cell depended on out-of-order execution.
# The result is the same, since a push title is in the intersection exactly
# when it is in the corpus titles.
df = df[~df.title.isin(df_.title)]
In [63]:
# Titles present in both the corpus (df_) and the push frame (df).
# NOTE(review): this cell was executed as In [63], before the filter cell
# (In [68]) that reads `s`, although it appears after it in the notebook.
s = set(df_.title) & set(df.title)
In [69]:
# Shape check; the recorded (1052, 4) was produced after the duplicate-title
# filter (In [68]).
df.shape
Out[69]:
(1052, 4)
In [67]:
# Shape check; recorded as In [67], i.e. run before the duplicate-title
# filter, hence (1301, 4).
df.shape
Out[67]:
(1301, 4)
In [70]:
# Embed every remaining push title, in row order of `df`.
# The comprehension keeps the loop variable local; the old `for s in ...` loop
# overwrote the notebook-level `s` set with the last title string.
vectors = [get_sent_vec(title) for title in tqdm(df.title)]
HBox(children=(IntProgress(value=0, max=1052), HTML(value='')))
In [71]:
# Query the index with all push-title vectors at once.
# presumably k=1 requests a single best hit per query — TODO confirm against VecQuery.
r = vq.query(vectors, k=1)

In [82]:
# One result per query vector is expected; the recorded value 1052 matches the
# number of push titles.
len(r)
Out[82]:
1052
In [94]:
# Scratch peek at the first result's (key, score) pairs; `p` is not read by any
# other visible cell.
p = r[0].items()
In [138]:
# Flatten each result into [corpus_id, corpus_title, score], taking its first
# (key, score) entry.
# - maxsplit=1 keeps a title that itself contains the separator in one piece,
#   so every row has exactly three fields and converts to a rectangular array.
# - the loop variable is no longer `_`, which IPython uses for the last output.
# - next(iter(...)) reads the first entry without building a throwaway list.
l = []
for hit in r:
    key, score = next(iter(hit.items()))
    l.append(key.split('🕷', 1) + [score])
In [139]:
# Peek at the first field of every row (the matched corpus id).
np.array(l)[:, 0]
Out[139]:
array(['5c3f56fc3b7b8231fe6dfd693871182d',
       '211d5e2f676a759ed1f266cb0a151a54', '0M3pei04', ..., '0M3vC8Zr',
       '95731fb116f584691cd3b0619d95d78f',
       '81a93644ee9213b65df18e7682a3e6d6'], dtype='<U64')
In [151]:
# Attach the best-match id, title and score to the push frame.
# - `l` is unzipped once instead of being converted to an array three times.
# - scores stay numeric; the old path pushed them through a unicode array and
#   parsed them back with astype(float).
# - assign() returns a new frame, avoiding column assignment on a frame that
#   was produced by boolean filtering (SettingWithCopyWarning).
# Rows of `l` are in the same order as the rows of `df`.
sim_ids, sim_titles, sim_scores = zip(*l)
df = df.assign(
    title_sim_id=list(sim_ids),
    title_sim=list(sim_titles),
    score=np.asarray(sim_scores, dtype=float),
)
In [155]:
# Export all matches, lowest similarity first, as a tab-separated file.
# `sep` is passed by keyword; positional arguments after the path are
# deprecated in to_csv.
df.sort_values('score').to_csv('./sim_title.tsv', sep='\t', index=False)
In [207]:
# Distribution of best-match scores, drawn with 10 histogram bins.
# NOTE(review): distplot is deprecated in newer seaborn releases; kept because
# the logged environment is Python 3.5.
sns.distplot(df.score, bins=10)
Out[207]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8fdfd714a8>
Notebook Image
In [210]:
# Summary statistics of the best-match similarity score.
df.score.describe()
Out[210]:
count    1052.000000
mean        0.962005
std         0.032873
min         0.830264
25%         0.942006
50%         0.969304
75%         0.990741
max         0.999509
Name: score, dtype: float64
In [221]:
# Count scores per bucket (0.8-0.85, 0.85-0.9, 0.9-0.95, 0.95-1) and express
# the counts as a percentage of 2210.
# NOTE(review): 2210 is hard-coded while df.score has 1052 rows in the recorded
# describe() output — confirm which total the denominator refers to.
ss = pd.cut(df.score, [0.8, 0.85, 0.9, 0.95, 1]).value_counts() / 2210 * 100
In [222]:
from itertools import combinations
In [216]:
# `combinations.` was a dangling attribute access (a SyntaxError that stops
# Run All). The recorded output 0.4113122171945701 equals 909 / 2210, using the
# same hard-coded counts as the next cell, so that expression is restored.
909 / 2210
Out[216]:
0.4113122171945701
In [218]:
# Share of two hard-coded counts in a hard-coded total.
# NOTE(review): 681, 909 and 2210 are not derived from any visible cell —
# confirm their source.
(681+909)/2210
Out[218]:
0.7194570135746606
In [223]:
import socket
# Resolve this machine's fully-qualified hostname to an IP address.
# NOTE(review): the saved output exposes an internal IP — clear outputs before
# sharing the notebook.
socket.gethostbyname(socket.getfqdn(socket.gethostname()))
Out[223]:
'10.177.31.195'
In [224]:
# Hostname of the machine running the kernel.
socket.gethostname()
Out[224]:
'de-33103-demo-0415190407-5bd8b69d4f-fq24n'
In [225]:
# Fully-qualified name of this machine; the recorded output is the same as the
# plain hostname.
socket.getfqdn()
Out[225]:
'de-33103-demo-0415190407-5bd8b69d4f-fq24n'
In [226]:
# Repeat of the hostname lookup from In [224].
socket.gethostname()
Out[226]:
'de-33103-demo-0415190407-5bd8b69d4f-fq24n'
In [159]:
# Show the matches ordered by score, highest first.
# The bare positional `0, 0` meant axis=0, ascending=False; they are spelled
# out because newer pandas accepts only `by` positionally.
df.sort_values('score', axis=0, ascending=False)
Out[159]:
In [163]:
# Ad-hoc lookup: embed one hand-written title and show its hits from the index.
# Note that this rebinds the notebook-level name `s`.
s = """@猫眼娱乐: 王思聪又换女友了?"""
vq.query([get_sent_vec(s)])[0]
Out[163]:
OrderedDict([('b510fbb225de7a98298bd4a8c74a722f🕷王思聪又换女友了?',
              0.8429329991340637),
             ('d7769909abc3944cde00965745371e83🕷王思聪又交了新女朋友,网友:还是同一张脸',
              0.8295964002609253),
             ('e3ea7ec29040b921a624bc8a38e7cd1f🕷王思聪现身台北, 网友: 疑似校长又换女友了?',
              0.8254078030586243),
             ('c77cf7a646f38b824e87fbef84163b57🕷王思聪又找新女友?房祖名否认:只是普通朋友',
              0.8241278529167175),
             ('07bc05e4f5b68170acda42081bf33b6a🕷王思聪不爱网红了? 新明星女友登上《男人装》, 网友: 艳福不浅!',
              0.8182055354118347),
             ('ba2971aa2f8136ca5c5e4cbc8635a815🕷又换女友了?思聪和撕过的人玩耍,网友:你爸快完了!',
              0.8176959753036499),
             ('bd120bf1ee9d75eeaf3b12177a8c696d🕷王思聪新女友参加活动, 网友: 王思聪情何以堪。。',
              0.8166388273239136),
             ('f7a4dbd1d4a001514a47e84d88be330e🕷房祖名曝出与王思聪同游台北的照片, 是想借国民老公的影响力上热搜吗? 网友: 王校长没人你失望',
              0.8125787377357483),
             ('7419f17fc31fe04bccba5d96c66f5f24🕷与王思聪闹翻的她又回来了;网友: 终于等到你……',
              0.812347412109375),
             ('96a6b4cf1a17010c60b10e528ae6b7d9🕷王思聪又换女友了?被拍带新女友回家过夜,周洁琼也难逃分手命运',
              0.8117027282714844)])
In [164]:
# Same score summary as In [210]; the recorded numbers are identical.
df.score.describe()
Out[164]:
count    1052.000000
mean        0.962005
std         0.032873
min         0.830264
25%         0.942006
50%         0.969304
75%         0.990741
max         0.999509
Name: score, dtype: float64
In [166]:
# Export the matches scoring above 0.99 as a tab-separated file.
# `sep` is passed by keyword; positional arguments after the path are
# deprecated in to_csv.
df[df.score>0.99].to_csv('./t.tsv', sep='\t')
In [167]:
"""竞秀区七旬老人凌晨四点外出遛弯,穿红色上衣,骑银灰色三轮车,望留意>>照片"""
Out[167]:
'竞秀区七旬老人凌晨四点外出遛弯,穿红色上衣,骑银灰色三轮车,望留意>>照片'
In [177]:
# Look up two specific titles in `sv.embeddings`.
# presumably this is a mapping keyed by the raw sentence string — TODO confirm
# against Sent2Vec. Neither v1 nor v2 is read by any other visible cell.
v1 = sv.embeddings["""竞秀区七旬老人凌晨四点外出遛弯,穿红色上衣,骑银灰色三轮车,望留意>>照片"""]
v2 = sv.embeddings["""父亲您在哪?西安七旬老人外出遛弯走失,穿蓝色上衣、军用鞋,骑人力三轮车"""]
In [182]:
import jovian
In [184]:
# Upload a snapshot of this notebook to Jovian and capture the pip environment.
jovian.commit(env_type='pip')
[jovian] Saving notebook..
[jovian] Updating notebook "fb4664f6f82f408a86d276a21308d47c" on https://jvn.io [jovian] Uploading notebook.. [jovian] Capturing environment.. [jovian] Committed successfully! https://jvn.io/Jie-Yuan/fb4664f6f82f408a86d276a21308d47c
In [ ]:
 
In [187]:
from pathlib import Path
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, TaggedLineDocument


class GensimDoc2Vec(object):
    """Thin convenience wrapper for building a gensim ``Doc2Vec`` model.

    Attributes:
        model: the ``Doc2Vec`` instance created by the most recent ``fit``
            call, or ``None`` before ``fit`` has been called.
    """

    def __init__(self):
        # Populated by fit(); kept on the instance so the wrapper is usable
        # after fitting and not only through fit()'s return value.
        self.model = None

    def fit(self, corpus, vector_size=300, window=10, min_count=1, dm=1, hs=0, negative=5,
            epochs=10, workers=8):
        """Build a ``Doc2Vec`` model from ``corpus`` and return it.

        Args:
            corpus: either a ``str`` path to an existing text file, which is
                handed to ``TaggedLineDocument``, or an iterable of documents,
                each of which is wrapped in a ``TaggedDocument`` tagged with
                its position in the iterable.
            vector_size, window, min_count, dm, hs, negative, epochs, workers:
                forwarded unchanged to ``Doc2Vec``.

        Returns:
            The ``Doc2Vec`` instance (also stored as ``self.model``).

        Raises:
            FileNotFoundError: if ``corpus`` is a string that does not name an
                existing file.
        """
        if isinstance(corpus, str):
            # A bare string is only meaningful as a file path. Without this
            # guard a mistyped path fell through to the in-memory branch and
            # every character of the string became its own "document".
            if not Path(corpus).is_file():
                raise FileNotFoundError('corpus file not found: %s' % corpus)
            corpus = TaggedLineDocument(corpus)
        else:
            corpus = [TaggedDocument(line, [idx]) for idx, line in enumerate(corpus)]

        self.model = Doc2Vec(documents=tqdm(corpus), vector_size=vector_size,
                             window=window, min_count=min_count, dm=dm,
                             hs=hs, negative=negative, epochs=epochs, workers=workers)
        return self.model



In [191]:
import logging
 
## Train the word2vec model
## NOTE(review): the surrounding cells use Doc2Vec — confirm this heading.
 
# Configure logging output
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
In [194]:
if __name__ == '__main__':
    # Tiny pre-tokenised toy corpus: 10 short documents repeated 100 times
    # (1000 documents, matching the recorded progress bar).
    docs = [['Well', 'done!'],
            ['Good', 'work'],
            ['Great', 'effort'],
            ['nice', 'work'],
            ['Excellent!'],
            ['Weak'],
            ['Poor', 'effort!'],
            ['not', 'good'],
            ['poor', 'work'],
            ['Could', 'have', 'done', 'better.']]*100
    model = GensimDoc2Vec()
    # fit() has a required `corpus` parameter: the bare fit() call raised
    # TypeError and left `docs` unused.
    model.fit(docs)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
In [196]:
# Scratch: a Doc2Vec created with default arguments and no documents; `m` is
# not read by any other visible cell.
m = Doc2Vec()
In [204]:
# Same score summary as In [210]; the recorded numbers are identical.
df.score.describe()
Out[204]:
count    1052.000000
mean        0.962005
std         0.032873
min         0.830264
25%         0.942006
50%         0.969304
75%         0.990741
max         0.999509
Name: score, dtype: float64
In [ ]:
# Display the score column (this cell has no execution count and no saved output).
df.score
In [227]:
# Serve the current working directory over HTTP on port 8080.
# This blocks the kernel until interrupted (the recorded run ends with ^C).
# NOTE(review): the log shows it binding 0.0.0.0, i.e. every network interface.
!python -m http.server 8080
Serving HTTP on 0.0.0.0 port 8080 ... ^C Keyboard interrupt received, exiting.
In [229]:
!export SANIC_NO_UVLOOP=true
!export SANIC_NO_UJSON=true
!pip install --no-binary :all: sanic
Requirement already satisfied: sanic in /usr/local/lib/python3.5/dist-packages (19.3.1) Requirement already satisfied: websockets<7.0,>=6.0 in /usr/local/lib/python3.5/dist-packages (from sanic) (6.0) Requirement already satisfied: httptools>=0.0.10 in /usr/local/lib/python3.5/dist-packages (from sanic) (0.0.13) Requirement already satisfied: ujson>=1.35; sys_platform != "win32" and implementation_name == "cpython" in /usr/local/lib/python3.5/dist-packages (from sanic) (1.35) Requirement already satisfied: uvloop>=0.5.3; sys_platform != "win32" and implementation_name == "cpython" in /usr/local/lib/python3.5/dist-packages (from sanic) (0.12.2) Requirement already satisfied: aiofiles>=0.3.0 in /usr/local/lib/python3.5/dist-packages (from sanic) (0.4.0) Requirement already satisfied: multidict<5.0,>=4.0 in /usr/local/lib/python3.5/dist-packages (from sanic) (4.5.2)
In [230]:
# `model.n` was a half-typed attribute access: GensimDoc2Vec defines no `n`, so
# it raises AttributeError. The recorded output is the repr of `model` itself,
# so the cell is restored to that expression.
model
Out[230]:
<__main__.GensimDoc2Vec at 0x7f910fb0b390>
In [ ]:
import jovian
# Upload a snapshot of this notebook to Jovian and capture the pip environment.
jovian.commit(env_type='pip')
[jovian] Saving notebook..
In [ ]: