Hi,
I just got my Prodigy license, but sadly I'm already stuck.
When I try to import my dataset, which contains already-annotated entities, I get errors like this one:
Task exception was never retrieved
future: <Task finished coro=<RequestResponseCycle.run_asgi() done, defined at C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\uvicorn\protocols\http\h11_impl.py:386> exception=ValueError("Mismatched tokenization. Can't resolve span to token index 109. This can happen if your data contains pre-set spans. Make sure that the spans match spaCy's tokenization or add a 'tokens' property to your task.\n\n{'start': 93, 'end': 109, 'label': 'ARTIKEL', 'token_start': 13}")>
Traceback (most recent call last):
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 391, in run_asgi
self.logger.error(msg, exc_info=exc)
File "C:\Users\szabop\AppData\Local\Programs\Python\Python37\Lib\logging\__init__.py", line 1412, in error
self.log(ERROR, msg, args, **kwargs)
File "C:\Users\szabop\AppData\Local\Programs\Python\Python37\Lib\logging\__init__.py", line 1519, in log
self.handle(record)
File "C:\Users\szabop\AppData\Local\Programs\Python\Python37\Lib\logging\__init__.py", line 1528, in handle
if (not self.disabled) and self.filter(record):
File "C:\Users\szabop\AppData\Local\Programs\Python\Python37\Lib\logging\__init__.py", line 762, in filter
result = f.filter(record)
File "cython_src\prodigy\util.pyx", line 121, in prodigy.util.ServerErrorFilter.filter
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 388, in run_asgi
result = await app(self.scope, self.receive, self.send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\uvicorn\middleware\proxy_headers.py", line 45, in call
return await self.app(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\fastapi\applications.py", line 140, in call
await super().call(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\applications.py", line 134, in call
await self.error_middleware(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\errors.py", line 178, in call
raise exc from None
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\errors.py", line 156, in call
await self.app(scope, receive, _send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\cors.py", line 84, in call
await self.simple_response(scope, receive, send, request_headers=headers)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\cors.py", line 140, in simple_response
await self.app(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\base.py", line 25, in call
response = await self.dispatch_func(request, self.call_next)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\prodigy\app.py", line 198, in reset_db_middleware
response = await call_next(request)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\base.py", line 45, in call_next
task.result()
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\middleware\base.py", line 38, in coro
await self.app(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\exceptions.py", line 73, in call
raise exc from None
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\exceptions.py", line 62, in call
await self.app(scope, receive, sender)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\routing.py", line 590, in call
await route(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\routing.py", line 208, in call
await self.app(scope, receive, send)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\routing.py", line 41, in app
response = await func(request)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\fastapi\routing.py", line 129, in app
raw_response = await run_in_threadpool(dependant.call, **values)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\starlette\concurrency.py", line 25, in run_in_threadpool
return await loop.run_in_executor(None, func, *args)
File "C:\Users\szabop\AppData\Local\Programs\Python\Python37\Lib\concurrent\futures\thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\prodigy\app.py", line 420, in get_session_questions
return _shared_get_questions(req.session_id, excludes=req.excludes)
File "C:\Users\szabop.virtualenvs\prodigy-2mgOBuS3\lib\site-packages\prodigy\app.py", line 391, in _shared_get_questions
tasks = controller.get_questions(session_id=session_id, excludes=excludes)
File "cython_src\prodigy\core.pyx", line 223, in prodigy.core.Controller.get_questions
File "cython_src\prodigy\core.pyx", line 227, in prodigy.core.Controller.get_questions
File "cython_src\prodigy\components\feeds.pyx", line 99, in prodigy.components.feeds.SharedFeed.get_questions
File "cython_src\prodigy\components\feeds.pyx", line 111, in prodigy.components.feeds.SharedFeed.get_next_batch
File "cython_src\prodigy\components\preprocess.pyx", line 130, in add_tokens
File "cython_src\prodigy\components\preprocess.pyx", line 222, in prodigy.components.preprocess._add_tokens
File "cython_src\prodigy\components\preprocess.pyx", line 207, in prodigy.components.preprocess.sync_spans_to_tokens
ValueError: Mismatched tokenization. Can't resolve span to token index 109. This can happen if your data contains pre-set spans. Make sure that the spans match spaCy's tokenization or add a 'tokens' property to your task.

{'start': 93, 'end': 109, 'label': 'ARTIKEL', 'token_start': 13}
I read through all the threads but I haven't really found a solution to this.
Here's an example JSON:
[{
"text": "Arbeitnehmer (auch geringfügig Beschäftigte, Lehrlinge), die ihrem Arbeitgeber ein COVID-19-Risiko-Attest vorlegen, haben Anspruch auf Freistellung von der Arbeitsleistung und Fortzahlung des Entgelts, sofern der Arbeitnehmer die Arbeitsleistung nicht im Home-Office erbringen kann oder die Arbeitsbedingungen in der Arbeitsstätte nicht durch geeignete Maßnahmen so gestaltet werden können, dass eine Ansteckung mit COVID-19 mit größtmöglicher Sicherheit ausgeschlossen ist. Mit BGBl II 2020/609 wurde nun der Zeitraum, in dem Freistellungen von Arbeitnehmern mit einem COVID-19-Risiko-Attest möglich sind, bis zum Ablauf des 31. 3. 2021 verlängert.",
"spans": [{
"start": 479,
"end": 495,
"label": "BGBL"
}],
"tokens": [
{
"text": "Arbeitnehmer",
"start": 0,
"end": 12,
"id": 0
},
{
"text": "(",
"start": 13,
"end": 14,
"id": 1
},
{
"text": "auch",
"start": 14,
"end": 18,
"id": 2
},
{
"text": "geringfügig",
"start": 19,
"end": 30,
"id": 3
},
{
"text": "Beschäftigte",
"start": 31,
"end": 43,
"id": 4
},
{
"text": ",",
"start": 43,
"end": 44,
"id": 5
},
{
"text": "Lehrlinge",
"start": 45,
"end": 54,
"id": 6
},
{
"text": ")",
"start": 54,
"end": 55,
"id": 7
},
{
"text": ",",
"start": 55,
"end": 56,
"id": 8
},
{
"text": "die",
"start": 57,
"end": 60,
"id": 9
},
{
"text": "ihrem",
"start": 61,
"end": 66,
"id": 10
},
{
"text": "Arbeitgeber",
"start": 67,
"end": 78,
"id": 11
},
{
"text": "ein",
"start": 79,
"end": 82,
"id": 12
},
{
"text": "COVID-19-Risiko-Attest",
"start": 83,
"end": 105,
"id": 13
},
{
"text": "vorlegen",
"start": 106,
"end": 114,
"id": 14
},
{
"text": ",",
"start": 114,
"end": 115,
"id": 15
},
{
"text": "haben",
"start": 116,
"end": 121,
"id": 16
},
{
"text": "Anspruch",
"start": 122,
"end": 130,
"id": 17
},
{
"text": "auf",
"start": 131,
"end": 134,
"id": 18
},
{
"text": "Freistellung",
"start": 135,
"end": 147,
"id": 19
},
{
"text": "von",
"start": 148,
"end": 151,
"id": 20
},
{
"text": "der",
"start": 152,
"end": 155,
"id": 21
},
{
"text": "Arbeitsleistung",
"start": 156,
"end": 171,
"id": 22
},
{
"text": "und",
"start": 172,
"end": 175,
"id": 23
},
{
"text": "Fortzahlung",
"start": 176,
"end": 187,
"id": 24
},
{
"text": "des",
"start": 188,
"end": 191,
"id": 25
},
{
"text": "Entgelts",
"start": 192,
"end": 200,
"id": 26
},
{
"text": ",",
"start": 200,
"end": 201,
"id": 27
},
{
"text": "sofern",
"start": 202,
"end": 208,
"id": 28
},
{
"text": "der",
"start": 209,
"end": 212,
"id": 29
},
{
"text": "Arbeitnehmer",
"start": 213,
"end": 225,
"id": 30
},
{
"text": "die",
"start": 226,
"end": 229,
"id": 31
},
{
"text": "Arbeitsleistung",
"start": 230,
"end": 245,
"id": 32
},
{
"text": "nicht",
"start": 246,
"end": 251,
"id": 33
},
{
"text": "im",
"start": 252,
"end": 254,
"id": 34
},
{
"text": "Home-Office",
"start": 255,
"end": 266,
"id": 35
},
{
"text": "erbringen",
"start": 267,
"end": 276,
"id": 36
},
{
"text": "kann",
"start": 277,
"end": 281,
"id": 37
},
{
"text": "oder",
"start": 282,
"end": 286,
"id": 38
},
{
"text": "die",
"start": 287,
"end": 290,
"id": 39
},
{
"text": "Arbeitsbedingungen",
"start": 291,
"end": 309,
"id": 40
},
{
"text": "in",
"start": 310,
"end": 312,
"id": 41
},
{
"text": "der",
"start": 313,
"end": 316,
"id": 42
},
{
"text": "Arbeitsstätte",
"start": 317,
"end": 330,
"id": 43
},
{
"text": "nicht",
"start": 331,
"end": 336,
"id": 44
},
{
"text": "durch",
"start": 337,
"end": 342,
"id": 45
},
{
"text": "geeignete",
"start": 343,
"end": 352,
"id": 46
},
{
"text": "Maßnahmen",
"start": 353,
"end": 362,
"id": 47
},
{
"text": "so",
"start": 363,
"end": 365,
"id": 48
},
{
"text": "gestaltet",
"start": 366,
"end": 375,
"id": 49
},
{
"text": "werden",
"start": 376,
"end": 382,
"id": 50
},
{
"text": "können",
"start": 383,
"end": 389,
"id": 51
},
{
"text": ",",
"start": 389,
"end": 390,
"id": 52
},
{
"text": "dass",
"start": 391,
"end": 395,
"id": 53
},
{
"text": "eine",
"start": 396,
"end": 400,
"id": 54
},
{
"text": "Ansteckung",
"start": 401,
"end": 411,
"id": 55
},
{
"text": "mit",
"start": 412,
"end": 415,
"id": 56
},
{
"text": "COVID-19",
"start": 416,
"end": 424,
"id": 57
},
{
"text": "mit",
"start": 425,
"end": 428,
"id": 58
},
{
"text": "größtmöglicher",
"start": 429,
"end": 443,
"id": 59
},
{
"text": "Sicherheit",
"start": 444,
"end": 454,
"id": 60
},
{
"text": "ausgeschlossen",
"start": 455,
"end": 469,
"id": 61
},
{
"text": "ist",
"start": 470,
"end": 473,
"id": 62
},
{
"text": ".",
"start": 473,
"end": 474,
"id": 63
},
{
"text": "Mit",
"start": 475,
"end": 478,
"id": 64
},
{
"text": "BGBl",
"start": 479,
"end": 483,
"id": 65
},
{
"text": "II",
"start": 484,
"end": 486,
"id": 66
},
{
"text": "2020",
"start": 487,
"end": 491,
"id": 67
},
{
"text": "/",
"start": 491,
"end": 492,
"id": 68
},
{
"text": "609",
"start": 492,
"end": 495,
"id": 69
},
{
"text": "wurde",
"start": 496,
"end": 501,
"id": 70
},
{
"text": "nun",
"start": 502,
"end": 505,
"id": 71
},
{
"text": "der",
"start": 506,
"end": 509,
"id": 72
},
{
"text": "Zeitraum",
"start": 510,
"end": 518,
"id": 73
},
{
"text": ",",
"start": 518,
"end": 519,
"id": 74
},
{
"text": "in",
"start": 520,
"end": 522,
"id": 75
},
{
"text": "dem",
"start": 523,
"end": 526,
"id": 76
},
{
"text": "Freistellungen",
"start": 527,
"end": 541,
"id": 77
},
{
"text": "von",
"start": 542,
"end": 545,
"id": 78
},
{
"text": "Arbeitnehmern",
"start": 546,
"end": 559,
"id": 79
},
{
"text": "mit",
"start": 560,
"end": 563,
"id": 80
},
{
"text": "einem",
"start": 564,
"end": 569,
"id": 81
},
{
"text": "COVID-19-Risiko-Attest",
"start": 570,
"end": 592,
"id": 82
},
{
"text": "möglich",
"start": 593,
"end": 600,
"id": 83
},
{
"text": "sind",
"start": 601,
"end": 605,
"id": 84
},
{
"text": ",",
"start": 605,
"end": 606,
"id": 85
},
{
"text": "bis",
"start": 607,
"end": 610,
"id": 86
},
{
"text": "zum",
"start": 611,
"end": 614,
"id": 87
},
{
"text": "Ablauf",
"start": 615,
"end": 621,
"id": 88
},
{
"text": "des",
"start": 622,
"end": 625,
"id": 89
},
{
"text": "31.",
"start": 626,
"end": 629,
"id": 90
},
{
"text": "3.",
"start": 630,
"end": 632,
"id": 91
},
{
"text": "2021",
"start": 633,
"end": 637,
"id": 92
},
{
"text": "verlängert",
"start": 638,
"end": 648,
"id": 93
},
{
"text": ".",
"start": 648,
"end": 649,
"id": 94
}
]
}]
This particular doc works, but there are too many to find the one that fails.
I'm not sure what to do at this point, and I'm honestly confused as to why this would even happen. I'm providing the entities with character indices, so why would there be a mismatch?
Any help would be much appreciated.