{
"resources": [
{
"id": "concepcy",
"title": "concepCy",
"slogan": "A multilingual knowledge graph in spaCy",
"description": "A spaCy wrapper for ConceptNet, a freely-available semantic network designed to help computers understand the meaning of words.",
"github": "JulesBelveze/concepcy",
"pip": "concepcy",
"code_example": [
"import spacy",
"import concepcy",
"",
"nlp = spacy.load('en_core_web_sm')",
"# Using default concepCy configuration",
"nlp.add_pipe('concepcy')",
"",
"doc = nlp('WHO is a lovely company')",
"",
"# Access all the 'RelatedTo' relations from the Doc",
"for word, relations in doc._.relatedto.items():",
" print(f'Word: {word}\n{relations}')",
"",
"# Access the 'RelatedTo' relations word by word",
"for token in doc:",
" print(f'Word: {token}\n{token._.relatedto}')"
],
"category": ["pipeline"],
"image": "https://github.com/JulesBelveze/concepcy/blob/main/figures/concepcy.png",
"tags": ["semantic", "ConceptNet"],
"author": "Jules Belveze",
"author_links": {
"github": "JulesBelveze",
"website": "https://www.linkedin.com/in/jules-belveze/"
}
},
{
"id": "spacyfishing",
"title": "spaCy fishing",
"slogan": "Named entity disambiguation and linking on Wikidata in spaCy with Entity-Fishing.",
"description": "A spaCy wrapper of Entity-Fishing for named entity disambiguation and linking against a Wikidata knowledge base.",
"github": "Lucaterre/spacyfishing",
"pip": "spacyfishing",
"code_example": [
"import spacy",
"text = 'Victor Hugo and Honoré de Balzac are French writers who lived in Paris.'",
"nlp = spacy.load('en_core_web_sm')",
"nlp.add_pipe('entityfishing')",
"doc = nlp(text)",
"for span in doc.ents:",
" print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))",
"# ('Victor Hugo', 'PERSON', 'Q535', 'https://www.wikidata.org/wiki/Q535', 0.972)",
"# ('Honoré de Balzac', 'PERSON', 'Q9711', 'https://www.wikidata.org/wiki/Q9711', 0.9724)",
"# ('French', 'NORP', 'Q121842', 'https://www.wikidata.org/wiki/Q121842', 0.3739)",
"# ('Paris', 'GPE', 'Q90', 'https://www.wikidata.org/wiki/Q90', 0.5652)",
"## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids"
],
"category": ["models", "pipeline"],
"image": "https://raw.githubusercontent.com/Lucaterre/spacyfishing/main/docs/spacyfishing-logo-resized.png",
"tags": ["NER", "NEL"],
"author": "Lucas Terriel",
"author_links": {
"twitter": "TerreLuca",
"github": "Lucaterre"
}
},
{
"id": "aim-spacy",
"title": "Aim-spaCy",
"slogan": "Aim-spaCy is an Aim-based spaCy experiment tracker.",
"description": "Aim-spaCy helps to easily collect, store and explore training logs for spaCy, including: hyper-parameters, metrics and displaCy visualizations",
"github": "aimhubio/aim-spacy",
"pip": "aim-spacy",
"code_example": [
"https://github.com/aimhubio/aim-spacy/tree/master/examples"
],
"code_language": "python",
"url": "https://aimstack.io/spacy",
"thumb": "https://user-images.githubusercontent.com/13848158/172912427-ee9327ea-3cd8-47fa-8427-6c0d36cd831f.png",
"image": "https://user-images.githubusercontent.com/13848158/136364717-0939222c-55b6-44f0-ad32-d9ab749546e4.png",
"author": "AimStack",
"author_links": {
"twitter": "aimstackio",
"github": "aimhubio",
"website": "https://aimstack.io"
},
"category": ["visualizers"],
"tags": ["experiment-tracking", "visualization"]
},
{
"id": "spacy-report",
"title": "spacy-report",
"slogan": "Generates interactive reports for spaCy models.",
"description": "The goal of spacy-report is to offer static reports for spaCy models that help users make better decisions on how the models can be used.",
"github": "koaning/spacy-report",
"pip": "spacy-report",
"thumb": "https://github.com/koaning/spacy-report/raw/main/icon.png",
"image": "https://raw.githubusercontent.com/koaning/spacy-report/main/gif.gif",
"code_example": [
"python -m spacy report textcat training/model-best/ corpus/train.spacy corpus/dev.spacy"
],
"category": ["visualizers", "research"],
"author": "Vincent D. Warmerdam",
"author_links": {
"twitter": "fishnets88",
"github": "koaning",
"website": "https://koaning.io"
}
},
{
"id": "scrubadub_spacy",
"title": "scrubadub_spacy",
"category": ["pipeline"],
"slogan": "Remove personally identifiable information from text using spaCy.",
"description": "scrubadub removes personally identifiable information from text. scrubadub_spacy is an extension that uses spaCy NLP models to remove personal information from text.",
"github": "LeapBeyond/scrubadub_spacy",
"pip": "scrubadub-spacy",
"url": "https://github.com/LeapBeyond/scrubadub_spacy",
"code_language": "python",
"author": "Leap Beyond",
"author_links": {
"github": "LeapBeyond",
"website": "https://leapbeyond.ai"
},
"code_example": [
"import scrubadub, scrubadub_spacy",
"scrubber = scrubadub.Scrubber()",
"scrubber.add_detector(scrubadub_spacy.detectors.SpacyEntityDetector)",
"print(scrubber.clean(\"My name is Alex, I work at LifeGuard in London, and my eMail is [email protected] btw. my super secret twitter login is username: alex_2000 password: g-dragon180888\"))",
"# My name is {{NAME}}, I work at {{ORGANIZATION}} in {{LOCATION}}, and my eMail is {{EMAIL}} btw. my super secret twitter login is username: {{USERNAME}} password: {{PASSWORD}}"
]
},
{
"id": "spacy-setfit-textcat",
"title": "spacy-setfit-textcat",
"category": ["research"],
"tags": ["SetFit", "Few-Shot"],
"slogan": "spaCy Project: Experiments with SetFit & Few-Shot Classification",
"description": "This project is an experiment with spaCy and few-shot text classification using SetFit",
"github": "pmbaumgartner/spacy-setfit-textcat",
"url": "https://github.com/pmbaumgartner/spacy-setfit-textcat",
"code_language": "python",
"author": "Peter Baumgartner",
"author_links": {
"twitter" : "pmbaumgartner",
"github": "pmbaumgartner",
"website": "https://www.peterbaumgartner.com/"
},
"code_example": [
"https://colab.research.google.com/drive/1CvGEZC0I9_v8gWrBxSJQ4Z8JGPJz-HYb?usp=sharing"
]
},
{
"id": "spacy-experimental",
"title": "spacy-experimental",
"category": ["extension"],
"slogan": "Cutting-edge experimental spaCy components and features",
"description": "This package includes experimental components and features for spaCy v3.x, for example model architectures, pipeline components and utilities.",
"github": "explosion/spacy-experimental",
"pip": "spacy-experimental",
"url": "https://github.com/explosion/spacy-experimental",
"code_language": "python",
"author": "Explosion",
"author_links": {
"twitter" : "explosion_ai",
"github": "explosion",
"website": "https://explosion.ai/"
},
"code_example": [
"python -m pip install -U pip setuptools wheel",
"python -m pip install spacy-experimental"
]
},
{
"id": "spacypdfreader",
"title": "spadypdfreader",
"category": ["pipeline"],
"tags": ["PDF"],
"slogan": "Easy PDF to text to spaCy text extraction in Python.",
"description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built in parsers or bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.",
"github": "SamEdwardes/spacypdfreader",
"pip": "spacypdfreader",
"url": "https://samedwardes.github.io/spacypdfreader/",
"code_language": "python",
"author": "Sam Edwardes",
"author_links": {
"twitter": "TheReaLSamlam",
"github": "SamEdwardes",
"website": "https://samedwardes.com"
},
"code_example": [
"import spacy",
"from spacypdfreader import pdf_reader",
"",
"nlp = spacy.load('en_core_web_sm')",
"doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)",
"",
"# Get the page number of any token.",
"print(doc[0]._.page_number) # 1",
"print(doc[-1]._.page_number) # 4",
"",
"# Get page meta data about the PDF document.",
"print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'",
"print(doc._.page_range) # (1, 4)",
"print(doc._.first_page) # 1",
"print(doc._.last_page) # 4",
"",
"# Get all of the text from a specific PDF page.",
"print(doc._.page(4)) # 'able to display the destination page (unless...'"
]
},
{
"id": "nlpcloud",
"title": "NLPCloud.io",
"slogan": "Production-ready API for spaCy models in production",
"description": "A highly-available hosted API to easily deploy and use spaCy models in production. Supports NER, POS tagging, dependency parsing, and tokenization.",
"github": "nlpcloud",
"pip": "nlpcloud",
"code_example": [
"import nlpcloud",
"",
"client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')",
"client.entities('John Doe is a Go Developer at Google')",
"# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]"
],
"thumb": "https://avatars.githubusercontent.com/u/77671902",
"image": "https://nlpcloud.io/assets/images/logo.svg",
"code_language": "python",
"author": "NLPCloud.io",
"author_links": {
"github": "nlpcloud",
"twitter": "cloud_nlp",
"website": "https://nlpcloud.io"
},
"category": ["apis", "nonpython", "standalone"],
"tags": ["api", "deploy", "production"]
},
{
"id": "eMFDscore",
"title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",
"slogan": "Extended Moral Foundation Dictionary Scoring for Python",
"description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.",
"github": "medianeuroscience/emfdscore",
"code_example": [
"from emfdscore.scoring import score_docs",
"import pandas as pd",
"template_input = pd.read_csv('emfdscore/template_input.csv', header=None)",
"DICT_TYPE = 'emfd'",
"PROB_MAP = 'single'",
"SCORE_METHOD = 'bow'",
"OUT_METRICS = 'vice-virtue'",
"OUT_CSV_PATH = 'single-vv.csv'",
"df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)"
],
"code_language": "python",
"author": "Media Neuroscience Lab",
"author_links": {
"github": "medianeuroscience",
"twitter": "medianeuro"
},
"category": ["research", "teaching"],
"tags": ["morality", "dictionary", "sentiment"]
},
{
"id": "skweak",
"title": "skweak",
"slogan": "Weak supervision for NLP",
"description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
"github": "NorskRegnesentral/skweak",
"pip": "skweak",
"code_example": [
"import spacy, re",
"from skweak import heuristics, gazetteers, aggregation, utils",
"",
"# LF 1: heuristic to detect occurrences of MONEY entities",
"def money_detector(doc):",
" for tok in doc[1:]:",
" if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
" yield tok.i-1, tok.i+1, 'MONEY'",
"lf1 = heuristics.FunctionAnnotator('money', money_detector)",
"",
"# LF 2: detection of years with a regex",
"lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
"",
"# LF 3: a gazetteer with a few names",
"NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
"trie = gazetteers.Trie(NAMES)",
"lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
"",
"# We create a corpus (here with a single text)",
"nlp = spacy.load('en_core_web_sm')",
"doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
"",
"# apply the labelling functions",
"doc = lf3(lf2(lf1(doc)))",
"",
"# and aggregate them",
"hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
"hmm.fit_and_aggregate([doc])",
"",
"# we can then visualise the final result (in Jupyter)",
"utils.display_entities(doc, 'hmm')"
],
"code_language": "python",
"url": "https://github.com/NorskRegnesentral/skweak",
"thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
"image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
"author": "Pierre Lison",
"author_links": {
"twitter": "plison2",
"github": "plison",
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
"tags": [],
"spacy_version": 3
},
{
"id": "numerizer",
"title": "numerizer",
"slogan": "Convert natural language numerics into ints and floats.",
"description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.",
"github": "jaidevd/numerizer",
"pip": "numerizer",
"code_example": [
"from spacy import load",
"import numerizer",
"nlp = load('en_core_web_sm') # or any other model",
"doc = nlp('The Hogwarts Express is at platform nine and three quarters')",
"doc._.numerize()",
"# {nine and three quarters: '9.75'}"
],
"author": "Jaidev Deshpande",
"author_links": {
"github": "jaidevd",
"twitter": "jaidevd"
},
"category": ["standalone"]
},
{
"id": "spikex",
"title": "SpikeX - SpaCy Pipes for Knowledge Extraction",
"slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort",
"description": "SpikeX is a collection of pipes ready to be plugged in a spaCy pipeline. It aims to help in building knowledge extraction tools with almost-zero effort.",
"github": "erre-quadro/spikex",
"pip": "spikex",
"code_example": [
"from spacy import load as spacy_load",
"from spikex.wikigraph import load as wg_load",
"from spikex.pipes import WikiPageX",
"",
"# load a spacy model and get a doc",
"nlp = spacy_load('en_core_web_sm')",
"doc = nlp('An apple a day keeps the doctor away')",
"# load a WikiGraph",
"wg = wg_load('simplewiki_core')",
"# get a WikiPageX and extract all pages",
"wikipagex = WikiPageX(wg)",
"doc = wikipagex(doc)",
"# see all pages extracted from the doc",
"for span in doc._.wiki_spans:",
" print(span._.wiki_pages)"
],
"category": ["pipeline", "standalone"],
"author": "Erre Quadro",
"author_links": {
"github": "erre-quadro",
"website": "https://www.errequadrosrl.com"
}
},
{
"id": "spacy-dbpedia-spotlight",
"title": "DBpedia Spotlight for SpaCy",
"slogan": "Use DBpedia Spotlight to link entities inside SpaCy",
"description": "This library links SpaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).",
"github": "MartinoMensio/spacy-dbpedia-spotlight",
"pip": "spacy-dbpedia-spotlight",
"code_example": [
"import spacy_dbpedia_spotlight",
"# load your model as usual",
"nlp = spacy.load('en_core_web_lg')",
"# add the pipeline stage",
"nlp.add_pipe('dbpedia_spotlight')",
"# get the document",
"doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')",
"# see the entities",
"print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])",
"# inspect the raw data from DBpedia spotlight",
"print(doc.ents[0]._.dbpedia_raw_result)"
],
"category": ["models", "pipeline"],
"author": "Martino Mensio",
"author_links": {
"twitter": "MartinoMensio",
"github": "MartinoMensio",
"website": "https://martinomensio.github.io"
}
},
{
"id": "spacy-textblob",
"title": "spacytextblob",
"slogan": "A TextBlob sentiment analysis pipeline component for spaCy.",
"thumb": "https://github.com/SamEdwardes/spacytextblob/raw/main/docs/static/img/logo-thumb-square-250x250.png",
"description": "spacytextblob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extension `._.blob` to `Doc`, `Span`, and `Token` objects.",
"github": "SamEdwardes/spacytextblob",
"pip": "spacytextblob",
"code_example": [
"# the following installations are required",
"# python -m textblob.download_corpora",
"# python -m spacy download en_core_web_sm",
"",
"import spacy",
"from spacytextblob.spacytextblob import SpacyTextBlob",
"",
"nlp = spacy.load('en_core_web_sm')",
"nlp.add_pipe('spacytextblob')",
"text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'",
"doc = nlp(text)",
"doc._.blob.polarity # Polarity: -0.125",
"doc._.blob.subjectivity # Subjectivity: 0.9",
"doc._.blob.sentiment_assessments.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]",
"doc._.blob.ngrams() # [WordList(['I', 'had', 'a']), WordList(['had', 'a', 'really']), WordList(['a', 'really', 'horrible']), WordList(['really', 'horrible', 'day']), WordList(['horrible', 'day', 'It']), WordList(['day', 'It', 'was']), WordList(['It', 'was', 'the']), WordList(['was', 'the', 'worst']), WordList(['the', 'worst', 'day']), WordList(['worst', 'day', 'ever']), WordList(['day', 'ever', 'But']), WordList(['ever', 'But', 'every']), WordList(['But', 'every', 'now']), WordList(['every', 'now', 'and']), WordList(['now', 'and', 'then']), WordList(['and', 'then', 'I']), WordList(['then', 'I', 'have']), WordList(['I', 'have', 'a']), WordList(['have', 'a', 'really']), WordList(['a', 'really', 'good']), WordList(['really', 'good', 'day']), WordList(['good', 'day', 'that']), WordList(['day', 'that', 'makes']), WordList(['that', 'makes', 'me']), WordList(['makes', 'me', 'happy'])]"
],
"code_language": "python",
"url": "https://spacytextblob.netlify.app/",
"author": "Sam Edwardes",
"author_links": {
"twitter": "TheReaLSamlam",
"github": "SamEdwardes",
"website": "https://samedwardes.com"
},
"category": ["pipeline"],
"tags": ["sentiment", "textblob"],
"spacy_version": 3
},
{
"id": "spacy-ray",
"title": "spacy-ray",
"slogan": "Parallel and distributed training with spaCy and Ray",
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
"github": "explosion/spacy-ray",
"pip": "spacy-ray",
"category": ["training"],
"author": "Explosion / Anyscale",
"thumb": "https://i.imgur.com/7so6ZpS.png"
},
{
"id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers",
"slogan": "Pipelines for pretrained sentence-transformers (BERT, RoBERTa, XLM-RoBERTa & Co.) directly within spaCy",
"description": "This library lets you use the embeddings from [sentence-transformers](https://github.com/UKPLab/sentence-transformers) of Docs, Spans and Tokens directly from spaCy. Most models are for the english language but three of them are multilingual.",
"github": "MartinoMensio/spacy-sentence-bert",
"pip": "spacy-sentence-bert",
"code_example": [
"import spacy_sentence_bert",
"# load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/",
"nlp = spacy_sentence_bert.load_model('en_roberta_large_nli_stsb_mean_tokens')",
"# get two documents",
"doc_1 = nlp('Hi there, how are you?')",
"doc_2 = nlp('Hello there, how are you doing today?')",
"# use the similarity method that is based on the vectors, on Doc, Span or Token",
"print(doc_1.similarity(doc_2[0:7]))"
],
"category": ["models", "pipeline"],
"author": "Martino Mensio",
"author_links": {
"twitter": "MartinoMensio",
"github": "MartinoMensio",
"website": "https://martinomensio.github.io"
}
},
{
"id": "spacy-streamlit",
"title": "spacy-streamlit",
"slogan": "spaCy building blocks for Streamlit apps",
"github": "explosion/spacy-streamlit",
"description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.",
"pip": "spacy-streamlit",
"category": ["visualizers"],
"thumb": "https://i.imgur.com/mhEjluE.jpg",
"image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png",
"code_example": [
"import spacy_streamlit",
"",
"models = [\"en_core_web_sm\", \"en_core_web_md\"]",
"default_text = \"Sundar Pichai is the CEO of Google.\"",
"spacy_streamlit.visualize(models, default_text)"
],
"author": "Ines Montani",
"author_links": {
"twitter": "_inesmontani",
"github": "ines",
"website": "https://ines.io"
}
},
{
"id": "spaczz",
"title": "spaczz",
"slogan": "Fuzzy matching and more for spaCy.",
"description": "Spaczz provides fuzzy matching and multi-token regex matching functionality for spaCy. Spaczz's components have similar APIs to their spaCy counterparts and spaczz pipeline components can integrate into spaCy pipelines where they can be saved/loaded as models.",
"github": "gandersen101/spaczz",
"pip": "spaczz",
"code_example": [
"import spacy",
"from spaczz.matcher import FuzzyMatcher",
"",
"nlp = spacy.blank(\"en\")",
"text = \"\"\"Grint Anderson created spaczz in his home at 555 Fake St,",
"Apt 5 in Nashv1le, TN 55555-1234 in the US.\"\"\" # Spelling errors intentional.",
"doc = nlp(text)",
"",
"matcher = FuzzyMatcher(nlp.vocab)",
"matcher.add(\"NAME\", [nlp(\"Grant Andersen\")])",
"matcher.add(\"GPE\", [nlp(\"Nashville\")])",
"matches = matcher(doc)",
"",
"for match_id, start, end, ratio in matches:",
" print(match_id, doc[start:end], ratio)"
],
"code_language": "python",
"url": "https://spaczz.readthedocs.io/en/latest/",
"author": "Grant Andersen",
"author_links": {
"twitter": "gandersen101",
"github": "gandersen101"
},
"category": ["pipeline"],
"tags": ["fuzzy-matching", "regex"]
},
{
"id": "spacy-universal-sentence-encoder",
"title": "spaCy - Universal Sentence Encoder",
"slogan": "Make use of Google's Universal Sentence Encoder directly within spaCy",
"description": "This library lets you use Universal Sentence Encoder embeddings of Docs, Spans and Tokens directly from TensorFlow Hub",
"github": "MartinoMensio/spacy-universal-sentence-encoder",
"pip": "spacy-universal-sentence-encoder",
"code_example": [
"import spacy_universal_sentence_encoder",
"# load one of the models: ['en_use_md', 'en_use_lg', 'xx_use_md', 'xx_use_lg']",
"nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')",
"# get two documents",
"doc_1 = nlp('Hi there, how are you?')",
"doc_2 = nlp('Hello there, how are you doing today?')",
"# use the similarity method that is based on the vectors, on Doc, Span or Token",
"print(doc_1.similarity(doc_2[0:7]))"
],
"category": ["models", "pipeline"],
"author": "Martino Mensio",
"author_links": {
"twitter": "MartinoMensio",
"github": "MartinoMensio",
"website": "https://martinomensio.github.io"
}
},
{
"id": "whatlies",
"title": "whatlies",
"slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.",
"description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.",
"github": "koaning/whatlies",
"pip": "whatlies",
"thumb": "https://i.imgur.com/rOkOiLv.png",
"image": "https://raw.githubusercontent.com/koaning/whatlies/master/docs/gif-two.gif",
"code_example": [
"from whatlies import EmbeddingSet",
"from whatlies.language import SpacyLanguage",
"",
"lang = SpacyLanguage('en_core_web_md')",
"words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']",
"",
"emb = lang[words]",
"emb.plot_interactive(x_axis='man', y_axis='woman')"
],
"category": ["visualizers", "research"],
"author": "Vincent D. Warmerdam",
"author_links": {
"twitter": "fishnets88",
"github": "koaning",
"website": "https://koaning.io"
}
},
{
"id": "bertopic",
"title": "BERTopic",
"slogan": "Leveraging BERT and c-TF-IDF to create easily interpretable topics.",
"description": "BERTopic is a topic modeling technique that leverages embedding models and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. BERTopic supports guided, (semi-) supervised, hierarchical, and dynamic topic modeling.",
"github": "maartengr/bertopic",
"pip": "bertopic",
"thumb": "https://i.imgur.com/Rx2LfBm.png",
"image": "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/topic_visualization.gif",
"code_example": [
"import spacy",
"from bertopic import BERTopic",
"from sklearn.datasets import fetch_20newsgroups",
"",
"docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']",
"nlp = spacy.load('en_core_web_md', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])",
"",
"topic_model = BERTopic(embedding_model=nlp)",
"topics, probs = topic_model.fit_transform(docs)",
"",
"fig = topic_model.visualize_topics()",
"fig.show()"
],
"category": ["visualizers", "training"],
"author": "Maarten Grootendorst",
"author_links": {
"twitter": "maartengr",
"github": "maartengr",
"website": "https://maartengrootendorst.com"
}
},
{
"id": "tokenwiser",
"title": "tokenwiser",
"slogan": "Connect vowpal-wabbit & scikit-learn models to spaCy to run simple classification benchmarks. Comes with many utility functions for spaCy pipelines.",
"github": "koaning/tokenwiser",
"pip": "tokenwiser",
"thumb": "https://koaning.github.io/tokenwiser/token.png",
"image": "https://koaning.github.io/tokenwiser/logo-tokw.png",
"code_example": [
"import spacy",
"",
"from sklearn.pipeline import make_pipeline",
"from sklearn.feature_extraction.text import CountVectorizer",
"from sklearn.linear_model import LogisticRegression",
"",
"from tokenwiser.component import attach_sklearn_categoriser",
"",
"X = [",
" 'i really like this post',",
" 'thanks for that comment',",
" 'i enjoy this friendly forum',",
" 'this is a bad post',",
" 'i dislike this article',",
" 'this is not well written'",
"]",
"",
"y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']",
"",
"# Note that we're training a pipeline here via a single-batch `.fit()` method",
"pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)",
"",
"nlp = spacy.load('en_core_web_sm')",
"# This is where we attach our pre-trained model as a pipeline step.",
"attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)"
],
"category": ["pipeline", "training"],
"author": "Vincent D. Warmerdam",
"author_links": {
"twitter": "fishnets88",
"github": "koaning",
"website": "https://koaning.io"
}
},
{
"id": "Klayers",
"title": "Klayers",
"category": ["pipeline"],
"tags": ["AWS"],
"slogan": "spaCy as a AWS Lambda Layer",
"description": "A collection of Python Packages as AWS Lambda(λ) Layers",
"github": "keithrozario/Klayers",
"pip": "",
"url": "https://github.com/keithrozario/Klayers",
"code_language": "python",
"author": "Keith Rozario",
"author_links": {
"twitter" : "keithrozario",
"github": "keithrozario",
"website": "https://www.keithrozario.com"
},
"code_example": [
"# SAM Template",
"MyLambdaFunction:",
" Type: AWS::Serverless::Function",
" Handler: 02_pipeline/spaCy.main",
" Description: Name Entity Extraction",
" Runtime: python3.8",
" Layers:",
" - arn:aws:lambda:${self:provider.region}:113088814899:layer:Klayers-python37-spacy:18"
]
},
{
"type": "education",
"id": "video-spacys-ner-model-alt",
"title": "Named Entity Recognition (NER) using spaCy",
"slogan": "",
"description": "In this video, I show you how to do named entity recognition using the spaCy library for Python.",
"youtube": "Gn_PjruUtrc",
"author": "Applied Language Technology",
"author_links": {
"twitter": "HelsinkiNLP",
"github": "Applied-Language-Technology",
"website": "https://applied-language-technology.mooc.fi/"
},
"category": ["videos"]
},
{
"id": "HuSpaCy",
"title": "HuSpaCy",
"category": ["models"],
"tags": ["Hungarian"],
"slogan": "HuSpaCy: industrial-strength Hungarian natural language processing",
"description": "HuSpaCy is a spaCy model and a library providing industrial-strength Hungarian language processing facilities.",
"github": "huspacy/huspacy",
"pip": "huspacy",
"url": "https://github.com/huspacy/huspacy",
"code_language": "python",
"author": "SzegedAI",
"author_links": {
"github": "https://szegedai.github.io/",
"website": "https://u-szeged.hu/english"
},
"code_example": [
"# Load the model using huspacy",
"import huspacy",
"",
"nlp = huspacy.load()",
"",
"# Load the mode using spacy.load()",
"import spacy",
"",
"nlp = spacy.load(\"hu_core_news_lg\")",
"",
"# Load the model directly as a module",
"import hu_core_news_lg",
"",
"nlp = hu_core_news_lg.load()\n",
"# Either way you get the same model and can start processing texts.",
"doc = nlp(\"Csiribiri csiribiri zabszalma - négy csillag közt alszom ma.\")"
]
},
{
"id": "spacy-stanza",
"title": "spacy-stanza",
"slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy",
"description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)",
"github": "explosion/spacy-stanza",
"pip": "spacy-stanza",
"thumb": "https://i.imgur.com/myhLjMJ.png",
"code_example": [
"import stanza",
"import spacy_stanza",
"",
"stanza.download(\"en\")",
"nlp = spacy_stanza.load_pipeline(\"en\")",
"",
"doc = nlp(\"Barack Obama was born in Hawaii. He was elected president in 2008.\")",
"for token in doc:",
" print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)",
"print(doc.ents)"
],
"category": ["pipeline", "standalone", "models", "research"],
"author": "Explosion",
"author_links": {
"twitter": "explosion_ai",
"github": "explosion",
"website": "https://explosion.ai"
}
},
{
"id": "spacy-udpipe",
"title": "spacy-udpipe",
"slogan": "Use the latest UDPipe models directly in spaCy",
"description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.",
"github": "TakeLab/spacy-udpipe",
"pip": "spacy-udpipe",
"code_example": [
"import spacy_udpipe",
"",
"spacy_udpipe.download(\"en\") # download English model",
"",
"text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"",
"nlp = spacy_udpipe.load(\"en\")",
"",
"doc = nlp(text)",
"for token in doc:",
" print(token.text, token.lemma_, token.pos_, token.dep_)"
],
"category": ["pipeline", "standalone", "models", "research"],
"author": "TakeLab",
"author_links": {
"github": "TakeLab",
"website": "https://takelab.fer.hr/"
}
},
{
"id": "spacy-server",
"title": "spaCy Server",
"slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP",
"description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.",
"github": "neelkamath/spacy-server",
"code_example": [
"docker run --rm -dp 8080:8080 neelkamath/spacy-server",
"curl http://localhost:8080/ner -H 'Content-Type: application/json' -d '{\"sections\": [\"My name is John Doe. I grew up in California.\"]}'"
],
"code_language": "shell",
"url": "https://hub.docker.com/r/neelkamath/spacy-server",
"author": "Neel Kamath",
"author_links": {
"github": "neelkamath",
"website": "https://neelkamath.com"
},
"category": ["apis"],
"tags": ["docker"]
},
{
"id": "nlp-architect",
"title": "NLP Architect",
"slogan": "Python lib for exploring Deep NLP & NLU by Intel AI",
"github": "NervanaSystems/nlp-architect",
"pip": "nlp-architect",
"thumb": "https://i.imgur.com/vMideRx.png",
"category": ["standalone", "research"],
"tags": ["pytorch"]
},
{
"id": "Chatterbot",
"title": "Chatterbot",
"slogan": "A machine-learning based conversational dialog engine for creating chat bots",
"github": "gunthercox/ChatterBot",
"pip": "chatterbot",
"thumb": "https://i.imgur.com/eyAhwXk.jpg",
"code_example": [
"from chatterbot import ChatBot",
"from chatterbot.trainers import ListTrainer",
"# Create a new chat bot named Charlie",
"chatbot = ChatBot('Charlie')",
"trainer = ListTrainer(chatbot)",
"trainer.train([",
"'Hi, can I help you?',",
"'Sure, I would like to book a flight to Iceland.',",
"'Your flight has been booked.'",
"])",
"",
"response = chatbot.get_response('I would like to book a flight.')"
],
"author": "Gunther Cox",
"author_links": {
"github": "gunthercox"
},
"category": ["conversational", "standalone"],
"tags": ["chatbots"]
},
{
"id": "alibi",
"title": "alibi",
"slogan": "Algorithms for monitoring and explaining machine learning models ",
"github": "SeldonIO/alibi",
"pip": "alibi",
"thumb": "https://i.imgur.com/YkzQHRp.png",
"code_example": [
"from alibi.explainers import AnchorTabular",
"explainer = AnchorTabular(predict_fn, feature_names)",
"explainer.fit(X_train)",
"explainer.explain(x)"
],
"author": "Seldon",
"category": ["standalone", "research"]
},
{
"id": "spacymoji",
"slogan": "Emoji handling and meta data as a spaCy pipeline component",
"github": "ines/spacymoji",
"description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.",
"pip": "spacymoji",
"category": ["pipeline"],
"tags": ["emoji", "unicode"],
"thumb": "https://i.imgur.com/XOTYIgn.jpg",
"code_example": [
"import spacy",
"from spacymoji import Emoji",
"",
"nlp = spacy.load(\"en_core_web_sm\")",
"nlp.add_pipe(\"emoji\", first=True)",
"doc = nlp(\"This is a test 😻 👍🏿\")",
"",
"assert doc._.has_emoji is True",
"assert doc[2:5]._.has_emoji is True",
"assert doc[0]._.is_emoji is False",
"assert doc[4]._.is_emoji is True",
"assert doc[5]._.emoji_desc == \"thumbs up dark skin tone\"",
"assert len(doc._.emoji) == 2",
"assert doc._.emoji[1] == (\"👍🏿\", 5, \"thumbs up dark skin tone\")"
],
"author": "Ines Montani",
"author_links": {
"twitter": "_inesmontani",
"github": "ines",
"website": "https://ines.io"
}
},
{
"id": "spacyopentapioca",
"title": "spaCyOpenTapioca",
"slogan": "Named entity linking on Wikidata in spaCy via OpenTapioca",
"description": "A spaCy wrapper of OpenTapioca for named entity linking on Wikidata",
"github": "UB-Mannheim/spacyopentapioca",
"pip": "spacyopentapioca",
"code_example": [
"import spacy",
"nlp = spacy.blank('en')",
"nlp.add_pipe('opentapioca')",
"doc = nlp('Christian Drosten works in Germany.')",
"for span in doc.ents:",
" print((span.text, span.kb_id_, span.label_, span._.description, span._.score))",
"# ('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 3.6533377082098895)",
"# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)",
"## Check also span._.types, span._.aliases, span._.rank"
],
"category": ["models", "pipeline"],
"tags": ["NER", "NEL"],
"author": "Renat Shigapov",
"author_links": {
"twitter": "_shigapov",
"github": "shigapov"
}
},
{
"id": "spacy_readability",
"slogan": "Add text readability meta data to Doc objects",
"description": "spaCy v2.0 pipeline component for calculating readability scores of of text. Provides scores for Flesh-Kincaid grade level, Flesh-Kincaid reading ease, and Dale-Chall.",
"github": "mholtzscher/spacy_readability",
"pip": "spacy-readability",
"code_example": [
"import spacy",
"from spacy_readability import Readability",
"",
"nlp = spacy.load('en')",
"read = Readability(nlp)",
"nlp.add_pipe(read, last=True)",
"doc = nlp(\"I am some really difficult text to read because I use obnoxiously large words.\")",
"doc._.flesch_kincaid_grade_level",
"doc._.flesch_kincaid_reading_ease",
"doc._.dale_chall"
],
"author": "Michael Holtzscher",
"author_links": {
"github": "mholtzscher"
},
"category": ["pipeline"]
},
{
"id": "spacy-sentence-segmenter",
"title": "Sentence Segmenter",
"slogan": "Custom sentence segmentation for spaCy",
"code_example": [
"from seg.newline.segmenter import NewLineSegmenter",
"import spacy",
"",
"nlseg = NewLineSegmenter()",
"nlp = spacy.load('en')",
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
"doc = nlp(my_doc_text)"
],
"author": "tc64",
"author_links": {
"github": "tc64"
},
"category": ["pipeline"]
},
{
"id": "spacy_cld",
"title": "spaCy-CLD",
"slogan": "Add language detection to your spaCy pipeline using CLD2",
"description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language.",
"github": "nickdavidhaynes/spacy-cld",
"pip": "spacy_cld",
"code_example": [
"import spacy",
"from spacy_cld import LanguageDetector",
"",
"nlp = spacy.load('en')",
"language_detector = LanguageDetector()",
"nlp.add_pipe(language_detector)",
"doc = nlp('This is some English text.')",
"",
"doc._.languages # ['en']",
"doc._.language_scores['en'] # 0.96"
],
"author": "Nicholas D Haynes",
"author_links": {
"github": "nickdavidhaynes"
},
"category": ["pipeline"]
},
{
"id": "spacy-iwnlp",
"slogan": "German lemmatization with IWNLP",
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [IWNLP-py](https://github.com/Liebeck/iwnlp-py) as German lemmatizer directly into your spaCy pipeline.",
"github": "Liebeck/spacy-iwnlp",
"pip": "spacy-iwnlp",
"code_example": [
"import spacy",
"from spacy_iwnlp import spaCyIWNLP",
"",
"nlp = spacy.load('de')",
"iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')",
"nlp.add_pipe(iwnlp)",