Introduction to Latent Semantic Indexing for Text via Singular Value Decomposition

cuML + Latent Semantic Indexing (LSI)

Latent semantic indexing (LSI) is the process of extracting and analyzing documents in order to create a representation that captures the similarity of words and documents. LSI assumes that words with similar meanings occur in similar contexts, so it uses Singular Value Decomposition (SVD) to reduce the dimensionality of the word-usage representation of the entire document collection. This allows documents that are semantically similar but use different words to be re-represented as more similar in the reduced vector space. We recommend the following video introduction to latent semantic indexing (LSI) and Singular Value Decomposition (SVD).

cuML is RAPIDS' suite of GPU-accelerated machine learning algorithms that mirrors sklearn's API. The entire suite of what cuML offers can be found here.

Let's first import the data we will be using for this lab: sklearn's 20newsgroups dataset. You can read the documentation here. When fetching the training and testing data, make sure to remove headers, footers, and quotes.

Let's see what categories the newsgroup documents fall under.
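The fetch and category-listing steps above can be sketched as follows (a minimal sketch assuming sklearn's `fetch_20newsgroups`; the corpus is downloaded and cached on first call):

```python
from sklearn.datasets import fetch_20newsgroups

# Strip headers, footers, and quoted replies so models can't key on metadata
remove = ('headers', 'footers', 'quotes')
train = fetch_20newsgroups(subset='train', remove=remove)
test = fetch_20newsgroups(subset='test', remove=remove)

# The categories the newsgroup documents fall under
print(train.target_names)
```

`target_names` lists the 20 newsgroup categories, and `train.target` holds the category index for each document.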

Now we have to turn our newsgroup documents into a representation that we can run SVD on. For that, we can use TF-IDF, or term frequency-inverse document frequency. TF-IDF extracts information from a document by counting the occurrences of each word and then scaling down the impact of words that appear very frequently. The scaling prevents words that appear very frequently yet often carry little information, such as "and", "so", and "the", from becoming significant words in a document. More about TF-IDF can be read here.

We will use cuML's TF-IDF vectorizer. The API for TfidfVectorizer can be found within cuML's documentation. Fit the vectorizer with the training data and transform both the training and testing data. Make sure to convert the transformed data into numpy arrays.
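A minimal sketch of the fit/transform pattern, shown here with sklearn's `TfidfVectorizer` on toy documents so it runs on CPU (cuML's `TfidfVectorizer` mirrors this API; with cuML the result lives on the GPU, so an extra conversion such as `cupy.asnumpy` is needed to get numpy arrays):

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # cuML mirrors this API

docs_train = ["the gpu accelerates training", "svd reduces dimensionality"]
docs_test = ["gpu training is fast"]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train).toarray()  # fit on training data only
X_test = vectorizer.transform(docs_test).toarray()        # reuse the fitted vocabulary

print(X_train.shape, X_test.shape)
```

Both matrices share the vocabulary learned from the training data; test-set words that never appeared in training are simply dropped.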

Display our test data as a pandas DataFrame. Don't forget to add the column names for pandas.

There are quite a few features in our bag of words, more than 100 thousand! We will cut it down a bit to help our runtime in the following steps using sklearn's SelectPercentile function. SelectPercentile keeps only the top-scoring percentile of features, allowing us to discard the less important ones. Be sure to fit the model on our training data and the training data's target, and don't forget to transform the needed datasets.

After applying our SelectPercentile function, we have removed 90% of the initial features. The column headers will need to be updated to reflect the selected features (hint: look at the methods available to SelectPercentile).
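A sketch of the selection step on stand-in data (the `chi2` score function is an assumption; the lab may use a different scorer, but the fit/transform and `get_support` pattern is the same). `percentile=10` keeps the top 10% of features, i.e. removes 90%:

```python
import numpy as np
from sklearn.feature_selection import SelectPercentile, chi2

rng = np.random.default_rng(0)
X_train = rng.random((20, 100))        # stand-in TF-IDF matrix (non-negative, as chi2 requires)
y_train = rng.integers(0, 2, size=20)  # stand-in targets

# percentile=10 keeps the top 10% of features, i.e. removes 90%
selector = SelectPercentile(chi2, percentile=10)
X_train_sel = selector.fit_transform(X_train, y_train)

# Indices of the surviving features, usable to subset the original column names
kept = selector.get_support(indices=True)
print(X_train_sel.shape, kept[:5])
```

`get_support(indices=True)` is the method hinted at above: it returns the indices of the kept features, which you can use to index into the vectorizer's feature names and rebuild the column headers.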

Let's take a look at our new data.

In order to run SVD on our vectorized training data, we will need to use cuML's TruncatedSVD API. Set the number of components to 100, the number of iterations to 25, and the algorithm to 'jacobi'. Then fit the model with the training dataset and transform the training dataset. Documentation on cuML's TruncatedSVD can be found here.
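A sketch of this step on stand-in data; the cuML call with the lab's parameters is shown in a comment, and sklearn's `TruncatedSVD` (which has no Jacobi solver, so `algorithm='randomized'` is substituted) is used so the sketch runs on CPU:

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
X_train = rng.random((200, 500))  # stand-in for the selected TF-IDF features

# cuML version with the lab's parameters (assumed argument names):
#   from cuml.decomposition import TruncatedSVD
#   svd = TruncatedSVD(n_components=100, n_iter=25, algorithm='jacobi')
# sklearn equivalent so this sketch runs on CPU:
svd = TruncatedSVD(n_components=100, n_iter=25, algorithm='randomized')
X_train_svd = svd.fit_transform(X_train)

print(X_train_svd.shape, svd.singular_values_.shape)
```

After fitting, `svd.components_` holds the component-by-feature weight matrix and `svd.singular_values_` the 100 singular values.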

After fitting our SVD model, let's visualize our singular values. Plot the singular values in descending order.
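A plotting sketch using matplotlib on stand-in values (substitute `svd.singular_values_` from the fitted model):

```python
import numpy as np
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so this runs headless
import matplotlib.pyplot as plt

# Stand-in for svd.singular_values_
singular_values = np.random.default_rng(0).random(100) * 50

plt.figure(figsize=(8, 4))
plt.plot(np.sort(singular_values)[::-1], marker='.')  # descending order
plt.xlabel("Component")
plt.ylabel("Singular value")
plt.title("Singular values (descending)")
plt.savefig("singular_values.png")
```

On real TF-IDF data the curve typically drops off sharply, which is why a modest number of components captures most of the variance.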

Let's take a look at what words are associated with each component, using the 4th most significant component as our example (index 3). Remember that the larger a component's associated singular value, the more significant the component, and the word weights for each component can be found in the components_ of TruncatedSVD. Print the top 25 words in the 4th component. Finding the top words for a component can be achieved by sorting its weights from greatest to least and identifying the ordering of the features in the sorted arrangement (hint: use argsort).
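The argsort pattern, sketched on stand-in data (the `feature_names` array is hypothetical; in the lab it would come from the vectorizer's vocabulary after SelectPercentile):

```python
import numpy as np

rng = np.random.default_rng(0)
components = rng.standard_normal((100, 1000))  # stand-in for svd.components_
feature_names = np.array([f"word{i}" for i in range(1000)])  # hypothetical vocabulary

component = components[3]                   # 4th most significant component (index 3)
top_idx = np.argsort(component)[::-1][:25]  # feature indices, greatest weight first
top_words = feature_names[top_idx]
print(top_words)
```

`argsort` returns indices in ascending order, so reversing with `[::-1]` gives the greatest-to-least ordering before slicing off the top 25.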

Now let's take the 501st datapoint (index 500) from our testing dataset. We will set this as our "search" document. We will find the document in the training set most closely related to it, but first, let's take a look at the content of this document and its classification.

We will now reduce and then transform our search document into its component form to compare it with our reduced and transformed training set. First, reduce the datapoint to match our training set by transforming it with our fitted SelectPercentile model. Then, because our training documents have been transformed from containing features to containing components, we need to convert our search document to the same representation. We can do this by taking the dot product of the TF-IDF representation of our search document with our SVD components (be sure the dimensions match).
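The projection step, sketched with stand-in arrays: `components_` has shape (n_components, n_features), so the document row must be multiplied against its transpose for the dimensions to line up:

```python
import numpy as np

rng = np.random.default_rng(0)
components = rng.standard_normal((100, 1000))  # svd.components_: (n_components, n_features)
search_tfidf = rng.random((1, 1000))           # TF-IDF row for the search doc, after SelectPercentile

# Project into component space: (1, n_features) @ (n_features, n_components)
search_svd = search_tfidf @ components.T
print(search_svd.shape)
```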

To find the document in our training set most similar to our chosen document, we will run a cosine similarity function between our transformed search document and the transformed training set. Once you have the cosine similarity values, order the training set documents from most to least similar to our search document.
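One way to do this, sketched with sklearn's `cosine_similarity` on stand-in arrays (the lab may instead use cuML's metrics; the pattern is the same):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
X_train_svd = rng.standard_normal((500, 100))  # stand-in transformed training set
search_svd = rng.standard_normal((1, 100))     # stand-in transformed search document

sims = cosine_similarity(search_svd, X_train_svd).ravel()  # one score per training doc
order = np.argsort(sims)[::-1]                             # most to least similar
print(order[:3], sims[order[:3]])
```

`order[:3]` then gives the indices of the three training documents most similar to the search document, which can be used to look up their text and categories.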

What are the contents of the top 3 most similar documents in the training set? What categories do they fall under? Do they look similar to our initial search document?