My current problem is that running 01_Preprocess_Reddit.ipynb (code at the bottom) produces an empty output file:
12/22/2023 01:12 PM 0 reddit.jsonl
The code is straight from the example, I think (perhaps I don't understand how to point it at the correct .gz file). I also tried futzing with the iterator, with things like:
" # .gz archive or directory of archives
OUTPUT_FILE = "C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\reddit.jsonl" # path to output JSONL
#%%
!pip install srsly
#%%
import re
from pathlib import Path
import gzip
import srsly
#%%
class Reddit(object):
    """Stream cleaned comments from Reddit."""

    pre_format_re = re.compile(r"^[\`\*\~]")
    post_format_re = re.compile(r"[\`\*\~]$")
    url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
    link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")

    def __init__(
        self, file_path, meta_keys={"subreddit": "section", "created_utc": "utc"}
    ):
        """
        file_path (unicode / Path): Path to archive or directory of archives.
        meta_keys (dict): Meta data key included in the Reddit corpus, mapped
            to display name in Prodigy meta.
        RETURNS (Reddit): The Reddit loader.
        """
        self.meta = meta_keys
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise IOError(f"Can't find file path: {self.file_path}")

    def __iter__(self):
        for file_path in self.iter_files():
            with gzip.open(str(file_path), "rb") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        comment = srsly.json_loads(line)
                    except Exception:
                        # Print lines that aren't valid JSON, then skip them
                        print(line)
                        continue
                    if self.is_valid(comment):
                        text = self.strip_tags(comment["body"])
                        yield {"text": text, "meta": self.get_meta(comment)}

    def get_meta(self, item):
        return {name: item.get(key, "n/a") for key, name in self.meta.items()}

    def iter_files(self):
        if not self.file_path.is_dir():
            return [self.file_path]
        yield from self.file_path.glob("**/*.gz")

    def strip_tags(self, text):
        text = self.link_re.sub(r"\1", text)
        # Undo HTML entity escaping in the Reddit dump
        text = text.replace("&gt;", ">").replace("&lt;", "<")
        text = self.pre_format_re.sub("", text)
        text = self.post_format_re.sub("", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def is_valid(self, comment):
        return (
            comment["body"] is not None
            and comment["body"] != "[deleted]"
            and comment["body"] != ""
        )
#%%
stream = Reddit(INPUT_DATA)
srsly.write_jsonl(OUTPUT_FILE, stream)
#%%
stream = Reddit(INPUT_DATA)
for x in stream.iter_files():
    print(x)
#%%
stream = Reddit(INPUT_DATA)
i = 0
for x in stream:
    if i == 5:
        break
    print(x)
    i += 1
All of these produce empty output.
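In case it's relevant, here is a quick check I put together (my own debugging code, not from the example notebook; the path is my local one) to see whether the file is actually gzipped line-delimited JSON at all, or a tar archive:

import gzip
import tarfile

INPUT_DATA = r"C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\s2v_reddit_2015_md.tar.gz"

# Is this really a (gzipped) tar archive rather than gzipped JSONL?
print("tarfile thinks it's a tar archive:", tarfile.is_tarfile(INPUT_DATA))

# Peek at the first decompressed bytes: gzipped JSONL should start with b'{'
with gzip.open(INPUT_DATA, "rb") as f:
    print("first bytes:", f.read(80))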
If anyone has ideas, that would be great!
Dee
INPUT_DATA = r"C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\s2v_reddit_2015_md.tar.gz"  # .gz archive or directory of archives
OUTPUT_FILE = r"C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\reddit.jsonl"  # path to output JSONL
#%%
!pip install srsly
#%%
import re
from pathlib import Path
import gzip
import srsly
#%%
class Reddit(object):
    """Stream cleaned comments from Reddit."""

    pre_format_re = re.compile(r"^[\`\*\~]")
    post_format_re = re.compile(r"[\`\*\~]$")
    url_re = re.compile(r"\[([^]]+)\]\(%%URL\)")
    link_re = re.compile(r"\[([^]]+)\]\(https?://[^\)]+\)")

    def __init__(
        self, file_path, meta_keys={"subreddit": "section", "created_utc": "utc"}
    ):
        """
        file_path (unicode / Path): Path to archive or directory of archives.
        meta_keys (dict): Meta data key included in the Reddit corpus, mapped
            to display name in Prodigy meta.
        RETURNS (Reddit): The Reddit loader.
        """
        self.meta = meta_keys
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise IOError(f"Can't find file path: {self.file_path}")

    def __iter__(self):
        for file_path in self.iter_files():
            with gzip.open(str(file_path), "rb") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    comment = srsly.json_loads(line)
                    if self.is_valid(comment):
                        text = self.strip_tags(comment["body"])
                        yield {"text": text, "meta": self.get_meta(comment)}

    def get_meta(self, item):
        return {name: item.get(key, "n/a") for key, name in self.meta.items()}

    def iter_files(self):
        if not self.file_path.is_dir():
            return [self.file_path]
        yield from self.file_path.glob("**/*.gz")

    def strip_tags(self, text):
        text = self.link_re.sub(r"\1", text)
        text = text.replace("&gt;", ">").replace("&lt;", "<")
        text = self.pre_format_re.sub("", text)
        text = self.post_format_re.sub("", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def is_valid(self, comment):
        return (
            comment["body"] is not None
            and comment["body"] != "[deleted]"
            and comment["body"] != ""
        )
#%%
stream = Reddit(INPUT_DATA)
srsly.write_jsonl(OUTPUT_FILE, stream)
#%%
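To sanity-check the class itself, I also tried feeding it a tiny hand-made archive in the format I think it expects (one JSON comment object per line, gzipped, inside a directory). Everything here is my own test scaffolding with made-up names and data, not part of the example:

import gzip
from pathlib import Path
import srsly

test_dir = Path("reddit_test")  # made-up local test directory
test_dir.mkdir(exist_ok=True)
sample = {
    "body": "I love [pasta](https://example.com) &gt; rice",
    "subreddit": "food",
    "created_utc": 1450000000,
}
with gzip.open(test_dir / "sample.gz", "wt", encoding="utf8") as f:
    f.write(srsly.json_dumps(sample) + "\n")

# Point the loader at the *directory* so iter_files() globs for *.gz
for record in Reddit(test_dir):
    print(record)

(If this prints a record, the class itself is fine and the problem is with my input file.)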
PS: I was able to get tok2vec_cd8_model289.bin from https://github.com/explosion/projects/releases/download/tok2vec/tok2vec_cd8_model289.bin (my copy of the project is at GitHub - Dwonczykj/ner_food).
NOTE: I also tried to un-gz and untar the file and point INPUT_DATA = r"C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\s2v_old" at the result, but that didn't work either.
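Since the loader only globs for *.gz when given a directory, I checked whether the untarred folder contains any (it doesn't appear to), which might explain why that attempt was silent too. Quick check, using my local path and the same pattern iter_files() uses:

from pathlib import Path

s2v_dir = Path(r"C:\Users\dwu\Documents\ner_Prodigy\ner-food-ingredients\s2v_old")
print(list(s2v_dir.glob("**/*.gz")))  # empty list means nothing to stream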