Skip to content

Commit

Permalink
Add the word segmentation task in the Taskflow (PaddlePaddle#1123)
Browse files Browse the repository at this point in the history
* Add the word segmentation task in the Taskflow

* format the example in the taskflow

* remove the limit for seq length for the segment

* Add the poetry_generation and question answering tasks

* change the text2knowledge to ner task

* update some code for the taskflow

* add the pos tagging for the taskflow

* Update the doc for the taskflow tasks

* update the pos tagging doc

Co-authored-by: Zeyu Chen <[email protected]>
  • Loading branch information
wawltor and ZeyuChen authored Oct 10, 2021
1 parent dc68121 commit 081e285
Show file tree
Hide file tree
Showing 11 changed files with 464 additions and 58 deletions.
27 changes: 17 additions & 10 deletions examples/language_model/gpt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,22 +233,29 @@ python deploy/python/inference.py --model_type gpt \
from paddlenlp import Taskflow

# 默认是知识问答任务
question = Taskflow("text_generation")
question("中国的国土面积有多大?")
qa = Taskflow("question_answering")
qa("中国的国土面积有多大?")
'''
[{'text': '中国的国土面积有多大?', 'answer': '960万平方公里。'}]
'''

# 使用写诗任务进行写诗
poetry = Taskflow("text_generation", generation_task="poetry")
poetry("林密不见人")
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}]
qa(["中国国土面积有多大?", "中国的首都在哪里?"])
'''
poetry(["林密不见人", "举头邀明月"])
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}, {'text': '举头邀明月', 'answer': ',低头思故乡。'}]
[{'text': '中国国土面积有多大?', 'answer': '960万平方公里。'}, {'text': '中国的首都在哪里?', 'answer': '北京。'}]
'''

# 使用写诗任务进行写诗

poetry = Taskflow("poetry_generation")
poetry("林密不见人")
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}]
'''

poetry(["林密不见人", "举头邀明月"])
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}, {'text': '举头邀明月', 'answer': ',低头思故乡。'}]
'''
```

## 其他
Expand Down
6 changes: 3 additions & 3 deletions examples/text_to_knowledge/wordtag/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ Term-Linking示例程序可以对无标签数据可以启动模型预测, 例如

```python
from paddlenlp import Taskflow
task = Taskflow("text2knowledge", model="wordtag")
task(["热梅茶是一道以梅子为主要原料制作的茶饮",
ner = Taskflow("ner", model="wordtag", linking=True)
ner(["热梅茶是一道以梅子为主要原料制作的茶饮",
"《孤女》是2010年九州出版社出版的小说,作者是余兼羽"])
# A single text string can also be passed directly as input
task("热梅茶是一道以梅子为主要原料制作的茶饮")
ner("热梅茶是一道以梅子为主要原料制作的茶饮")

```
下面是运行WordTag工具后的知识链接的预测结果
Expand Down
5 changes: 3 additions & 2 deletions examples/text_to_knowledge/wordtag/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ def parse_args():
def do_predict(args):
paddle.set_device(args.device)
wordtag = Taskflow(
"text2knowledge",
"ner",
model="wordtag",
batch_size=args.batch_size,
max_seq_length=args.max_seq_len)
max_seq_length=args.max_seq_len,
linking=True)
txts = ["《孤女》是2010年九州出版社出版的小说,作者是余兼羽。", "热梅茶是一道以梅子为主要原料制作的茶饮"]
res = wordtag(txts)
print(res)
Expand Down
3 changes: 0 additions & 3 deletions paddlenlp/taskflow/lexical_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True):
'batch_size'] if 'batch_size' in self.kwargs else 1
num_workers = self.kwargs[
'num_workers'] if 'num_workers' in self.kwargs else 0
max_seq_len = self.kwargs[
'max_seq_len'] if 'max_seq_len' in self.kwargs else 64
infer_data = []
oov_token_id = self._word_vocab.get("OOV")

Expand All @@ -170,7 +168,6 @@ def read(inputs):
len(input_tokens.strip()) > 0):
continue
filter_inputs.append(input_tokens)
input_tokens = input_tokens[:max_seq_len]
ids = []
for token in input_tokens:
token = self._q2b_vocab.get(token, token)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,19 +113,19 @@
usage = r"""
from paddlenlp import Taskflow
task = Taskflow("text2knowledge")
task("《孤女》是2010年九州出版社出版的小说,作者是余兼羽")
ner = Taskflow("ner")
ner("《孤女》是2010年九州出版社出版的小说,作者是余兼羽")
'''
[{'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
[{'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
'''
task = Taskflow("text2knowledge", batch_size=2)
task(["热梅茶是一道以梅子为主要原料制作的茶饮",
ner = Taskflow("ner", batch_size=2)
ner(["热梅茶是一道以梅子为主要原料制作的茶饮",
"《孤女》是2010年九州出版社出版的小说,作者是余兼羽",
"中山中环广场,位于广东省中山市东区,地址是东区兴政路1号",
"宫之王是一款打发休闲时光的迷宫游戏"])
'''
[{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1, 'termid': '介词_cb_以'}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2, 'termid': '饮食_cb_梅'}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_为'}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4, 'termid': '物品_cb_主要原料'}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_制作'}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2, 'termid': '饮品_cb_茶饮'}]}, {'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 
'length': 3}]}, {'text': '中山中环广场,位于广东省中山市东区,地址是东区兴政路1号', 'items': [{'item': '中山中环广场', 'offset': 0, 'wordtag_label': '场所类', 'length': 6}, {'item': ',', 'offset': 6, 'wordtag_label': 'w', 'length': 1}, {'item': '位于', 'offset': 7, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_位于'}, {'item': '广东省', 'offset': 9, 'wordtag_label': '世界地区类', 'length': 3, 'termid': '中国地区_cb_广东省'}, {'item': '中山市东', 'offset': 12, 'wordtag_label': '世界地区类', 'length': 4}, {'item': '区', 'offset': 16, 'wordtag_label': '词汇用语', 'length': 1}, {'item': ',', 'offset': 17, 'wordtag_label': 'w', 'length': 1}, {'item': '地址', 'offset': 18, 'wordtag_label': '场所类', 'length': 2, 'termid': '区域场所_cb_地址'}, {'item': '是', 'offset': 20, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '东区', 'offset': 21, 'wordtag_label': '位置方位', 'length': 2, 'termid': '位置方位_cb_东区'}, {'item': '兴政路1号', 'offset': 23, 'wordtag_label': '世界地区类', 'length': 5}]}, {'text': '宫之王是一款打发休闲时光的迷宫游戏', 'items': [{'item': '宫之王', 'offset': 0, 'wordtag_label': '人物类_实体', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一款', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '打发', 'offset': 6, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_打发'}, {'item': '休闲', 'offset': 8, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_休闲'}, {'item': '时光', 'offset': 10, 'wordtag_label': '时间类', 'length': 2, 'termid': '时间阶段_cb_时光'}, {'item': '的', 'offset': 12, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '迷宫游戏', 'offset': 13, 'wordtag_label': '作品类_概念', 'length': 4}]}]
[{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2}]}, {'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}, {'text': '中山中环广场,位于广东省中山市东区,地址是东区兴政路1号', 'items': [{'item': '中山中环广场', 'offset': 0, 'wordtag_label': '场所类', 'length': 6}, {'item': ',', 'offset': 6, 'wordtag_label': 'w', 'length': 1}, {'item': '位于', 'offset': 7, 'wordtag_label': '场景事件', 'length': 2}, {'item': '广东省', 'offset': 9, 'wordtag_label': '世界地区类', 'length': 3}, {'item': '中山市东', 'offset': 12, 
'wordtag_label': '世界地区类', 'length': 4}, {'item': '区', 'offset': 16, 'wordtag_label': '词汇用语', 'length': 1}, {'item': ',', 'offset': 17, 'wordtag_label': 'w', 'length': 1}, {'item': '地址', 'offset': 18, 'wordtag_label': '场所类', 'length': 2}, {'item': '是', 'offset': 20, 'wordtag_label': '肯定词', 'length': 1}, {'item': '东区', 'offset': 21, 'wordtag_label': '位置方位', 'length': 2}, {'item': '兴政路1号', 'offset': 23, 'wordtag_label': '世界地区类', 'length': 5}]}, {'text': '宫之王是一款打发休闲时光的迷宫游戏', 'items': [{'item': '宫之王', 'offset': 0, 'wordtag_label': '人物类_实体', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1}, {'item': '一款', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '打发', 'offset': 6, 'wordtag_label': '场景事件', 'length': 2}, {'item': '休闲', 'offset': 8, 'wordtag_label': '场景事件', 'length': 2}, {'item': '时光', 'offset': 10, 'wordtag_label': '时间类', 'length': 2}, {'item': '的', 'offset': 12, 'wordtag_label': '助词', 'length': 1}, {'item': '迷宫游戏', 'offset': 13, 'wordtag_label': '作品类_概念', 'length': 4}]}]
'''
"""

Expand Down Expand Up @@ -157,7 +157,8 @@ def __init__(self, model, task, **kwargs):
self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)

self._termtree = TermTree.from_dir(term_schema_path, term_data_path)
self._linking = True
self._linking = self.kwargs[
'linking'] if 'linking' in self.kwargs else False
self._construct_tokenizer(model)
self._usage = usage
self._summary_num = 2
Expand Down
70 changes: 70 additions & 0 deletions paddlenlp/taskflow/poetry_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# coding:utf-8
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
from .utils import download_file
from .text_generation import TextGenerationTask
from .task import Task

usage = r"""
from paddlenlp import Taskflow
poetry = Taskflow("poetry_generation")
poetry("林密不见人")
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}]
'''
poetry(["林密不见人", "举头邀明月"])
'''
[{'text': '林密不见人', 'answer': ',但闻人语响。'}, {'text': '举头邀明月', 'answer': ',低头思故乡。'}]
'''
"""

URLS = {
"gpt-cpm-large-cn": [
"https://paddlenlp.bj.bcebos.com/taskflow/text_generation/gpt-cpm/gpt-cpm-large-cn_params.tar",
"5aad6f81053cfdbba4797f044fcf66d1"
],
}


class PoetryGenerationTask(TextGenerationTask):
"""
Text generation task specialised for Chinese poetry generation.

Builds on TextGenerationTask and marks the instance with the
'poetry_generation' generation task.
Args:
task(string): The name of task.
model(string): The model name in the task.
kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
"""

def __init__(self, task, model, **kwargs):
# The base class receives the task name, the model name and every extra keyword argument.
super().__init__(task=task, model=model, **kwargs)
# NOTE(review): indentation is lost in this view, so the exact extent of the
# if/else branches below cannot be verified from here — confirm against the repository.
if self._static_mode:
# Static mode: fetch "static/inference.pdiparams" using the URLS entry for this
# model, then load the inference model.
# NOTE(review): self.model is not assigned in this block — presumably set by the
# base class; a model name missing from URLS would raise KeyError here.
download_file(
self._task_path, "static" + os.path.sep + "inference.pdiparams",
URLS[self.model][0], URLS[self.model][1], "poetry_generation")
self._get_inference_model()
else:
# Otherwise build the model from the given model name.
self._construct_model(model)
self._construct_tokenizer(model)
# Record which generation task this instance performs.
# NOTE(review): presumably read by the inherited pre/post-processing — confirm.
self.kwargs['generation_task'] = 'poetry_generation'
Loading

0 comments on commit 081e285

Please sign in to comment.