Skip to content
This repository was archived by the owner on Apr 15, 2020. It is now read-only.

Commit f4ea470

Browse files
committed
分词接口(tag)支持传递参数:space_mode, oov_level, t2s, special_char_conv
1 parent 6461bdc commit f4ea470

File tree

3 files changed

+87
-47
lines changed

3 files changed

+87
-47
lines changed

bosonnlp/client.py

+65-29
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,11 @@ def sentiment(self, contents, model='general'):
123123
124124
>>> import os
125125
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
126-
>>> nlp.sentiment('这家味道还不错')
127-
[[0.8758192096636473, 0.12418079033635264]]
128-
>>> nlp.sentiment(['这家味道还不错', '菜品太少了而且还不新鲜'])
129-
[[0.8758192096636473, 0.12418079033635264], [0.33160979027792103, 0.668390209722079]]
126+
>>> nlp.sentiment('这家味道还不错', model='food')
127+
[[0.9991737012037423, 0.0008262987962577828]]
128+
>>> nlp.sentiment(['这家味道还不错', '菜品太少了而且还不新鲜'], model='food')
129+
[[0.9991737012037423, 0.0008262987962577828],
130+
[9.940036427291687e-08, 0.9999999005996357]]
130131
"""
131132
api_endpoint = '/sentiment/analysis?' + model
132133
r = self._api_request('POST', api_endpoint, data=contents)
@@ -149,11 +150,11 @@ def convert_time(self, content, basetime=None):
149150
150151
>>> import os
151152
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
152-
>>> nlp.convert_time("2013年二月二十八日下午四点三十分二十九秒")
153-
{u'timestamp': u'2013-02-28 16:30:29'}
153+
>>> _json_dumps(nlp.convert_time("2013年二月二十八日下午四点三十分二十九秒"))
154+
'{"timestamp": "2013-02-28 16:30:29", "type": "timestamp"}'
154155
>>> import datetime
155-
>>> nlp.convert_time("今天晚上8点到明天下午3点", datetime.datetime.today())
156-
{u'timespan': [u'2014-08-25 20:00:00', u'2014-08-26 15:00:00']}
156+
>>> _json_dumps(nlp.convert_time("今天晚上8点到明天下午3点", datetime.datetime(2015, 9, 1)))
157+
'{"timespan": ["2015-09-02 20:00:00", "2015-09-03 15:00:00"], "type": "timespan_0"}'
157158
158159
"""
159160
api_endpoint = '/time/analysis'
@@ -205,8 +206,8 @@ def suggest(self, word, top_k=None):
205206
206207
>>> import os
207208
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
208-
>>> nlp.suggest('python', top_k=1)
209-
[[0.9999999999999992, 'python/x']]
209+
>>> nlp.suggest('北京', top_k=2)
210+
[[1.0, '北京/ns'], [0.7493540460397998, '上海/ns']]
210211
"""
211212
api_endpoint = '/suggest/analysis'
212213
params = {}
@@ -234,7 +235,7 @@ def extract_keywords(self, text, top_k=None, segmented=False):
234235
>>> import os
235236
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
236237
>>> nlp.extract_keywords('病毒式媒体网站:让新闻迅速蔓延', top_k=2)
237-
[[0.4580507649282757, '蔓延'], [0.44467176143180404, '病毒']]
238+
[[0.8391345017584958, '病毒式'], [0.3802418301341705, '蔓延']]
238239
"""
239240
api_endpoint = '/keywords/analysis'
240241
params = {}
@@ -260,18 +261,18 @@ def depparser(self, contents):
260261
>>> import os
261262
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
262263
>>> nlp.depparser('今天天气好')
263-
[{'tag': ['NT', 'NN', 'VA'],
264+
[{'head': [2, 2, -1],
264265
'role': ['TMP', 'SBJ', 'ROOT'],
265-
'head': [2, 2, -1],
266+
'tag': ['NT', 'NN', 'VA'],
266267
'word': ['今天', '天气', '好']}]
267268
>>> nlp.depparser(['今天天气好', '美好的世界'])
268-
[{'tag': ['NT', 'NN', 'VA'],
269+
[{'head': [2, 2, -1],
269270
'role': ['TMP', 'SBJ', 'ROOT'],
270-
'head': [2, 2, -1],
271+
'tag': ['NT', 'NN', 'VA'],
271272
'word': ['今天', '天气', '好']},
272-
{'tag': ['VA', 'DEC', 'NN'],
273+
{'head': [1, 2, -1],
273274
'role': ['DEC', 'NMOD', 'ROOT'],
274-
'head': [1, 2, -1],
275+
'tag': ['VA', 'DEC', 'NN'],
275276
'word': ['美好', '的', '世界']}]
276277
"""
277278
api_endpoint = '/depparser/analysis'
@@ -296,16 +297,17 @@ def ner(self, contents, sensitivity=None):
296297
297298
>>> import os
298299
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
299-
>>> nlp.ner('成都商报记者 姚永忠')
300+
>>> nlp.ner('成都商报记者 姚永忠', sensitivity=2)
300301
[{'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']],
301302
'tag': ['ns', 'n', 'n', 'nr'],
302303
'word': ['成都', '商报', '记者', '姚永忠']}]
304+
303305
>>> nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])
304306
[{'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']],
305307
'tag': ['ns', 'n', 'n', 'nr'],
306308
'word': ['成都', '商报', '记者', '姚永忠']},
307309
{'entity': [[0, 2, 'product_name'], [3, 4, 'time']],
308-
'tag': ['nt', 'x', 'nl', 't', 'ad', 'v'],
310+
'tag': ['nt', 'nx', 'nl', 't', 'ad', 'v'],
309311
'word': ['微软', 'XP', '操作系统', '今日', '正式', '退休']}]
310312
"""
311313
api_endpoint = '/ner/analysis'
@@ -315,31 +317,63 @@ def ner(self, contents, sensitivity=None):
315317
r = self._api_request('POST', api_endpoint, data=contents, params=params)
316318
return r.json()
317319

318-
def tag(self, contents):
320+
def tag(self, contents, space_mode=0, oov_level=3, t2s=0, special_char_conv=0):
319321
"""BosonNLP `分词与词性标注 <http://docs.bosonnlp.com/tag.html>`_ 封装。
320322
321323
:param contents: 需要做分词与词性标注的文本或者文本序列。
322324
:type contents: string or sequence of string
323325
326+
:param space_mode: 空格保留选项
327+
:type space_mode: int(整型), 0-3有效
328+
329+
:param oov_level: 新词枚举强度选项
330+
:type oov_level: int(整型), 0-4有效
331+
332+
:param t2s: 繁简转换选项,繁转简或不转换
333+
:type t2s: int(整型), 0-1有效
334+
335+
:param special_char_conv: 特殊字符转化选项,针对回车、Tab等特殊字符转化或者不转化
336+
:type special_char_conv: int(整型), 0-1有效
337+
324338
:returns: 接口返回的结果列表。
325339
326340
:raises: :py:exc:`~bosonnlp.HTTPError` 如果 API 请求发生错误。
327341
342+
调用参数及返回值详细说明见:http://docs.bosonnlp.com/tag.html
343+
328344
调用示例:
329345
330346
>>> import os
331347
>>> nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
332-
>>> nlp.tag('成都商报记者 姚永忠')
333-
[{'tag': ['NR', 'NN', 'NN', 'NR'],
334-
'word': ['成都', '商报', '记者', '姚永忠']}]
335-
>>> nlp.tag(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])
336-
[{'tag': ['NR', 'NN', 'NN', 'NR'],
337-
'word': ['成都', '商报', '记者', '姚永忠']},
338-
{'tag': ['NR', 'NN', 'NN', 'NN', 'NT', 'AD', 'VV'],
339-
'word': ['微软', 'XP', '操作', '系统', '今日', '正式', '退休']}]
348+
349+
>>> result = nlp.tag('成都商报记者 姚永忠')
350+
>>> _json_dumps(result)
351+
'[{"tag": ["ns", "n", "n", "nr"], "word": ["成都", "商报", "记者", "姚永忠"]}]'
352+
353+
>>> format_tag_result = lambda tagged: ' '.join('%s/%s' % x for x in zip(tagged['word'], tagged['tag']))
354+
>>> result = nlp.tag("成都商报记者 姚永忠")
355+
>>> format_tag_result(result[0])
356+
'成都/ns 商报/n 记者/n 姚永忠/nr'
357+
358+
>>> result = nlp.tag("成都商报记者 姚永忠", space_mode=2)
359+
>>> format_tag_result(result[0])
360+
'成都/ns 商报/n 记者/n /w 姚永忠/nr'
361+
362+
>>> result = nlp.tag(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽'], oov_level=0)
363+
>>> format_tag_result(result[0])
364+
'亚/ns 投/v 行/n 意向/n 创始/vi 成员国/n 确定/v 为/v 57/m 个/q'
365+
366+
>>> format_tag_result(result[1])
367+
'“/wyz 流量/n 贵/a ”/wyy 频/d 被/pbei 吐槽/v'
340368
"""
341369
api_endpoint = '/tag/analysis'
342-
r = self._api_request('POST', api_endpoint, data=contents)
370+
params = {
371+
'space_mode': space_mode,
372+
'oov_level': oov_level,
373+
't2s': t2s,
374+
'special_char_conv': special_char_conv,
375+
}
376+
r = self._api_request('POST', api_endpoint, params=params, data=contents)
343377
return r.json()
344378

345379
def _cluster_push(self, task_id, contents):
@@ -657,6 +691,8 @@ def wait_until_complete(self, timeout=None):
657691
"""
658692
elapsed = 0.0
659693
seconds_to_sleep = 1.0
694+
if timeout is not None:
695+
seconds_to_sleep = min(seconds_to_sleep, timeout)
660696
i = 0
661697
while True:
662698
time.sleep(seconds_to_sleep)

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
requests>=2.0.0
2-
sphinx-rtd-theme==0.1.6
2+
sphinx-rtd-theme==0.1.8

tests.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def test_invalid_token_raises_HTTPError():
1313

1414

1515
@pytest.fixture(scope='module',
16-
params=[{}, {'bosonnlp_url': 'http://api.bosondata.net'}])
16+
params=[{}])
1717
def nlp(request):
1818
# 注意:在测试时请设置环境变量BOSON_API_TOKEN为您的 API token。
1919
return BosonNLP(os.environ['BOSON_API_TOKEN'], **request.param)
@@ -63,7 +63,7 @@ def test_classify(nlp):
6363

6464

6565
def test_suggest(nlp):
66-
assert nlp.suggest('python', top_k=1) == [[0.9999999999999992, 'python/x']]
66+
assert nlp.suggest('北京', top_k=2)[1][1] == '上海/ns'
6767

6868

6969
def test_extract_keywords(nlp):
@@ -89,27 +89,31 @@ def test_depparser(nlp):
8989

9090
def test_ner(nlp):
9191
assert nlp.ner('成都商报记者 姚永忠', sensitivity=2) == \
92-
[{'tag': ['ns', 'n', 'n', 'nr'],
93-
'word': ['成都', '商报', '记者', '姚永忠'],
94-
'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']]}]
92+
[{'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']],
93+
'tag': ['ns', 'n', 'n', 'nr'],
94+
'word': ['成都', '商报', '记者', '姚永忠']}]
95+
9596
assert nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'], sensitivity=2) == \
96-
[{'tag': ['ns', 'n', 'n', 'nr'],
97-
'word': ['成都', '商报', '记者', '姚永忠'],
98-
'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']]},
99-
{'tag': ['nt', 'x', 'nl', 't', 'ad', 'v'],
100-
'word': ['微软', 'XP', '操作系统', '今日', '正式', '退休'],
101-
'entity': [[0, 2, 'product_name'], [3, 4, 'time']]}]
97+
[{'entity': [[0, 2, 'product_name'], [3, 4, 'person_name']],
98+
'tag': ['ns', 'n', 'n', 'nr'],
99+
'word': ['成都', '商报', '记者', '姚永忠']},
100+
101+
{'entity': [[0, 2, 'product_name'], [3, 4, 'time']],
102+
'tag': ['nt', 'nx', 'nl', 't', 'ad', 'v'],
103+
'word': ['微软', 'XP', '操作系统', '今日', '正式', '退休']}]
102104

103105

104106
def test_tag(nlp):
105107
assert nlp.tag('成都商报记者 姚永忠') == \
106-
[{'tag': ['NR', 'NN', 'NN', 'NR'],
107-
'word': ['成都', '商报', '记者', '姚永忠']}]
108+
[{'word': ['成都', '商报', '记者', '姚永忠'],
109+
'tag': ['ns', 'n', 'n', 'nr']}]
110+
108111
assert nlp.tag(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休']) == \
109-
[{'tag': ['NR', 'NN', 'NN', 'NR'],
110-
'word': ['成都', '商报', '记者', '姚永忠']},
111-
{'tag': ['NR', 'NN', 'NN', 'NN', 'NT', 'AD', 'VV'],
112-
'word': ['微软', 'XP', '操作', '系统', '今日', '正式', '退休']}]
112+
[{'word': ['成都', '商报', '记者', '姚永忠'],
113+
'tag': ['ns', 'n', 'n', 'nr']},
114+
115+
{'word': ['微软', 'XP', '操作系统', '今日', '正式', '退休'],
116+
'tag': ['nt', 'nx', 'nl', 't', 'ad', 'v']}]
113117

114118

115119
def test_cluster_task_wait_until_complete_raises_TimeoutError(nlp):

0 commit comments

Comments
 (0)