### Morphological analysis (MeCab)
import MeCab

# text = "私はお昼休みに美味しいカレーライスを食べました。"

m = MeCab.Tagger("-Ochasen")

def mecab_sep(text):
    """Tokenize a Japanese sentence with MeCab.

    Verbs (動詞) and adjectives (形容詞) are replaced by their dictionary
    base form (feature field 6); every other token keeps its surface form.
    The leading BOS and trailing EOS sentinel nodes are stripped.

    :param text: sentence to tokenize
    :return: list of token strings
    """
    node = m.parseToNode(text)
    words_list = []
    while node:
        # Split the feature CSV once per node (the original recomputed
        # node.feature.split(",") up to three times per node).
        features = node.feature.split(",")
        if features[0] in ("動詞", "形容詞"):
            # Verb or adjective: use the base (dictionary) form.
            words_list.append(features[6])
        else:
            words_list.append(node.surface)
        node = node.next
    return words_list[1:-1]  # drop BOS/EOS sentinel entries


from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec.load("jawiki.doc2vec.dbow300d/jawiki.doc2vec.dbow300d.model")


def calc_vecs_d2v(docs):
    """Infer a doc2vec vector (from the pretrained jawiki model) per document."""
    return [model.infer_vector(mecab_sep(d)) for d in docs]


### Bag-of-words / tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

def calc_vecs(docs):
    """Return dense tf-idf vectors for `docs`, tokenized with mecab_sep."""
    vectorizer = TfidfVectorizer(analyzer=mecab_sep)
    vecs = vectorizer.fit_transform(docs)
    return vecs.toarray()


### Cosine similarity
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

input_doc = "君はネコが好きです。"

target_docs_df = pd.read_csv("target_docs.csv")
target_docs = target_docs_df["文章リスト"].tolist()

all_docs = [input_doc] + target_docs
all_docs_vecs = calc_vecs_d2v(all_docs)

# Similarity of the query (row 0) against every target document.
similarity = cosine_similarity([all_docs_vecs[0]], all_docs_vecs[1:])

target_docs_df["類似度"] = similarity[0]
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
文章リスト類似度
0私は犬が好きです。0.438497
4あなたは猫が好きです。0.388658
2私は犬がとても好きです。0.354345
3あなたは犬と猫が好きです。0.310741
1私は犬が嫌いです。0.278822
\n", "
# Rank the target documents by similarity to the query, highest first.
target_docs_df.sort_values(by="類似度", ascending=False)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
文章リスト類似度
4あなたは猫が好きです。0.924543
3あなたは犬と猫が好きです。0.919218
0私は犬が好きです。0.907596
1私は犬が嫌いです。0.903666
2私は犬がとても好きです。0.902655
\n", "
target_docs_df.sort_values("類似度", ascending=False)

### Load the corpus: one document per text file, skipping the first two
### metadata lines of each article.
import glob

target_docs = []
for path in glob.glob("text/*/*.txt"):
    # Explicit encoding: the corpus is Japanese text, so relying on the
    # platform default encoding would break on some systems.
    with open(path, encoding="utf-8") as f:
        next(f)  # skip first metadata line
        next(f)  # skip second metadata line
        # Read the remainder in one call instead of concatenating strings
        # line by line (O(n^2)); the original also called f.close() inside
        # the `with` block, which is redundant.
        target_docs.append(f.read())
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
カテゴリ内容
0movie-enter【DVDエンター!】誘拐犯に育てられた女が目にした真実は、孤独か幸福か\\n 2005年11月...
1movie-enter藤原竜也、中学生とともにロケット打ち上げに成功\\n 「アンテナを張りながら生活をしていけばい...
2movie-enter『戦火の馬』ロイヤル・プレミアにウィリアム王子&キャサリン妃が出席\\n 3月2日より全国ロー...
3movie-enter香里奈、女子高生100人のガチンコ質問に回答「ラーメンも食べる」\\n 女優の香里奈が18日、...
4movie-enterユージの前に立ちはだかったJOY「僕はAKBの高橋みなみを守る」\\n 5日、東京・千代田区の...
.........
7371smax好きな戦士を作ってドラゴンボールの世界で天下一武道会優勝だ!「挑戦!天下一武道会」【Andr...
7372smaxNTTドコモ、GALAXY SIII SC-06DとF-09D ANTEPRIMAの発売日を...
7373smaxNTTドコモ、Android向け「docomo Wi-Fiかんたん接続アプリ」をバージョンア...
7374smaxNTTドコモ、PRADA Phone by LG L-02Dのデコメ絵文字popが正常に表示...
7375smaxNTTドコモ、公式オンラインショップでも端末複数台購入で最大10,500円/台の割り引きが受...
\n", "

7376 rows × 2 columns

\n", "
all_docs_vecs = calc_vecs_d2v(target_docs)

### Cosine similarity: query document vs. the whole corpus
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

input_doc = "お前らどんだけ起きてんだよ!"

# One query vector compared against every corpus vector.
input_vecs = calc_vecs_d2v([input_doc])
similarity = cosine_similarity(input_vecs, all_docs_vecs)

target_docs_df = pd.DataFrame(target_docs, columns=["文章リスト"])
target_docs_df["類似度"] = similarity[0]
# Second-most-similar document among the five best matches.
top_matches = target_docs_df.sort_values("類似度", ascending=False).head(5)
top_matches["文章リスト"].tolist()[1]