import numpy as np

# Build a 1-D NumPy array from a plain Python list.
arr = np.asarray([1, 2, 3, 4, 5])

# Show the array's contents next to its type.
print(f"{arr} {type(arr)}")

[1 2 3 4 5] <class 'numpy.ndarray'>

# Arrays can have any number of dimensions, from 0-D (a single value) upwards.
arr_0d = np.array(84)
arr_1d = np.array([1, 2, 3, 4, 5])
arr_2d = np.array(
    [
        [4, 1, 3],
        [6, 2, 3],
    ]
)
arr_3d = np.array(
    [
        [[1, 2, 3], [4, 5, 6]],
        [[1, 2, 3], [4, 5, 6]],
    ]
)
# ndmin=5 asks for at least 5 dimensions for this 4-element list.
arr_5d = np.array([1, 2, 3, 4], ndmin=5)

# Print the 2-D array, then its number of dimensions (ndim) and its shape,
# each on its own line.
print(arr_2d, arr_2d.ndim, arr_2d.shape, sep="\n")

[[4 1 3]
 [6 2 3]]
2
(2, 3)

# Indexing works as with lists: one index on a 2-D array selects a whole row.
print(arr_2d[1])
# A comma-separated index selects a single element:
#print(arr_2d[1, 1])

# Slicing works too:
#print(arr_1d[1:3])

# And so do for loops: iterating over a 2-D array yields its rows, and
# unpacking a row hands its numbers to print(), one per line.
for row in arr_2d:
    print(*row, sep="\n")

[6 2 3]
4
1
3
6
2
3

# Searching: nonzero() on a boolean mask returns a tuple of index arrays
# telling where the condition holds.
search = np.nonzero(arr_1d == 4)
print(search)

# Sorting: np.sort returns a sorted copy; along the last axis (the default)
# each row of the 2-D array is sorted independently.
print(np.sort(arr_2d, axis=-1))

(array([3]),)
[[1 3 4]
 [2 3 6]]

import pandas as pd

# A dict mapping column names to equally long lists converts to a DataFrame
# with one column per key.
fruits = dict(
    labels=["apple", "pineapple", "mango", "banana"],
    prices=[0.1, 0.8, 0.7, 0.2],
)

df_fruits = pd.DataFrame.from_dict(fruits)

# Other ways to inspect the frame:
#df_fruits
#df_fruits.info()

# .loc[0] selects the row labelled 0.
print(df_fruits.loc[0])

labels    apple
prices      0.1
Name: 0, dtype: object

# Read and clean a CSV file
# Random file from https://www.data.gouv.fr/fr/datasets/effectifs-deleves-par-niveau-sexe-langues-vivantes-1-et-2-les-plus-frequentes-par-college-date-dobservation-au-debut-du-mois-doctobre-chaque-annee/

# The file is read with ';' as the field separator (pandas defaults to ',').
df = pd.read_csv("fr-en-college-effectifs-niveau-sexe-lv.csv", sep=";")

# Let's have a first look at it to see what it is about.
# info() prints its summary itself and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line after the summary.
df.info()
#df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50164 entries, 0 to 50163
Data columns (total 81 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   num_ligne                                   50164 non-null  int64  
 1   Rentrée scolaire                            50164 non-null  int64  
 2   Code région académique                      50164 non-null  int64  
 3   Code région Insee                           50164 non-null  int64  
 4   Région académique                           50164 non-null  object 
 5   Code académie                               49632 non-null  float64
 6   Académie                                    50164 non-null  object 
 7   Code département                            50164 non-null  object 
 8   Département                                 50164 non-null  object 
 9   Code commune                                47860 non-null  object 
 10  Commune                                     50164 non-null  object 
 11  UAI                                         50164 non-null  object 
 12  Dénomination principale                     50164 non-null  object 
 13  Patronyme                                   50164 non-null  object 
 14  Secteur                                     50164 non-null  object 
 15  REP                                         50164 non-null  int64  
 16  REP +                                       50164 non-null  int64  
 17  nombre_eleves_total                         50164 non-null  int64  
 18  Nombre d'élèves total hors Segpa hors ULIS  50164 non-null  int64  
 19  Nombre d'élèves total Segpa                 50164 non-null  int64  
 20  Nombre d'élèves total ULIS                  50164 non-null  int64  
 21  6èmes total                                 50164 non-null  int64  
 22  6èmes hors Segpa hors ULIS                  50164 non-null  int64  
 23  6èmes Segpa                                 50164 non-null  int64  
 24  6èmes ULIS                                  50164 non-null  int64  
 25  6eme_filles                                 50164 non-null  int64  
 26  6emes_garcons                               50164 non-null  int64  
 27  6èmes LV1 allemand                          50164 non-null  int64  
 28  6èmes LV1 anglais                           50164 non-null  int64  
 29  6èmes LV1 espagnol                          50164 non-null  int64  
 30  6èmes LV1 autres langues                    50164 non-null  int64  
 31  6èmes LV2 allemand                          50164 non-null  int64  
 32  6èmes LV2 anglais                           50164 non-null  int64  
 33  6èmes LV2 espagnol                          50164 non-null  int64  
 34  6èmes LV2 italien                           50164 non-null  int64  
 35  6ème LV2 autres langues                     50164 non-null  int64  
 36  5èmes total                                 50164 non-null  int64  
 37  5èmes hors Segpa hors ULIS                  50164 non-null  int64  
 38  5èmes Segpa                                 50164 non-null  int64  
 39  5èmes ULIS                                  50164 non-null  int64  
 40  5eme_filles                                 50164 non-null  int64  
 41  5emes_garcons                               50164 non-null  int64  
 42  5èmes LV1 allemand                          50164 non-null  int64  
 43  5èmes LV1 anglais                           50164 non-null  int64  
 44  5èmes LV1 espagnol                          50164 non-null  int64  
 45  5èmes LV1 autres langues                    50164 non-null  int64  
 46  5èmes LV2 allemand                          50164 non-null  int64  
 47  5èmes LV2 anglais                           50164 non-null  int64  
 48  5èmes LV2 espagnol                          50164 non-null  int64  
 49  5èmes LV2 italien                           50164 non-null  int64  
 50  5èmes LV2 autres langues                    50164 non-null  int64  
 51  4èmes total                                 50164 non-null  int64  
 52  4èmes hors Segpa hors ULIS                  50164 non-null  int64  
 53  4èmes Segpa                                 50164 non-null  int64  
 54  4èmes ULIS                                  50164 non-null  int64  
 55  4eme_filles                                 50164 non-null  int64  
 56  4emes_garcons                               50164 non-null  int64  
 57  4èmes LV1 allemand                          50164 non-null  int64  
 58  4èmes LV1 anglais                           50164 non-null  int64  
 59  4èmes LV1 espagnol                          50164 non-null  int64  
 60  4èmes LV1 autres langues                    50164 non-null  int64  
 61  4èmes LV2 allemand                          50164 non-null  int64  
 62  4èmes LV2 anglais                           50164 non-null  int64  
 63  4èmes LV2 espagnol                          50164 non-null  int64  
 64  4èmes LV2 italien                           50164 non-null  int64  
 65  4èmes LV2 autres langues                    50164 non-null  int64  
 66  3èmes total                                 50164 non-null  int64  
 67  3èmes hors Segpa hors ULIS                  50164 non-null  int64  
 68  3èmes Segpa                                 50164 non-null  int64  
 69  3èmes ULIS                                  50164 non-null  int64  
 70  3eme_filles                                 50164 non-null  int64  
 71  3emes_garcons                               50164 non-null  int64  
 72  3èmes LV1 allemand                          50164 non-null  int64  
 73  3èmes LV1 anglais                           50164 non-null  int64  
 74  3èmes LV1 espagnol                          50164 non-null  int64  
 75  3èmes LV1 autres langues                    50164 non-null  int64  
 76  3èmes LV2 allemand                          50164 non-null  int64  
 77  3èmes LV2 anglais                           50164 non-null  int64  
 78  3èmes LV2 espagnol                          50164 non-null  int64  
 79  3èmes LV2 italien                           50164 non-null  int64  
 80  3èmes LV2 autres langues                    50164 non-null  int64  
dtypes: float64(1), int64(70), object(10)
memory usage: 31.0+ MB
None

# Let's clean it up! Drop every row containing a missing value, then drop
# duplicated rows.
# (Alternative to dropping: df.fillna("Undetermined") fills the gaps instead.)
df = df.dropna().drop_duplicates()

# To check if a cell is "na":
#print(df['Code académie'].isna())

# To check if a cell is NOT "na":
#print(df['Code académie'].notna())

# Manipulate the data: a few statistics on one column of interest.
# (presumably the column's name in another version of the file:)
#interesting_col = "nombre_de_3emes_lv1_anglais"
interesting_col = "3èmes LV1 anglais"
column = df[interesting_col]
print("Mean", column.mean())
print("Median", column.median())
print("Min", column.min())
print("Max", column.max())
# Correlation between that column and the academic-region code column.
print("Correlation", column.corr(df["Code région académique"]))

# The whole table, ordered by the column of interest.
print(df.sort_values(by=interesting_col))

Mean 95.64283299526707
Median 95.0
Min 0
Max 480
Correlation 0.07456253299142748
       num_ligne  Rentrée scolaire  Code région académique  Code région Insee  \
50042      22531              2021                      14                 28   
54          8936              2020                       1                 84   
38055      15388              2020                      16                 76   
32448       3360              2019                       9                 32   
38174      17984              2021                       2                 27   
...          ...               ...                     ...                ...   
853        30410              2022                      13                  6   
24719       5368              2019                      13                  6   
17047      13707              2020                      13                  6   
29438      47169              2024                      13                  6   
44860      38783              2023                      13                  6   

             Région académique  Code académie   Académie Code département  \
50042                NORMANDIE           70.0  NORMANDIE               76   
54        AUVERGNE-RHONE-ALPES            8.0   GRENOBLE               73   
38055                OCCITANIE           16.0   TOULOUSE               31   
32448          HAUTS-DE-FRANCE            9.0      LILLE               59   
38174  BOURGOGNE-FRANCHE-COMTE            7.0      DIJON               21   
...                        ...            ...        ...              ...   
853                    MAYOTTE           43.0    MAYOTTE              976   
24719                  MAYOTTE           43.0    MAYOTTE              976   
17047                  MAYOTTE           43.0    MAYOTTE              976   
29438                  MAYOTTE           43.0    MAYOTTE              976   
44860                  MAYOTTE           43.0    MAYOTTE              976   

          Département Code commune  ... 3emes_garcons 3èmes LV1 allemand  \
50042  SEINE MARITIME        76681  ...            25                  0   
54             SAVOIE        73011  ...            13                  0   
38055   HAUTE-GARONNE        31395  ...            16                  0   
32448            NORD        59378  ...             0                  0   
38174       COTE D'OR        21054  ...            16                  0   
...               ...          ...  ...           ...                ...   
853           MAYOTTE        97607  ...           225                  0   
24719         MAYOTTE        97611  ...           240                  0   
17047         MAYOTTE        97610  ...           231                  0   
29438         MAYOTTE        97607  ...           209                  0   
44860         MAYOTTE        97607  ...           214                  0   

      3èmes LV1 anglais 3èmes LV1 espagnol 3èmes LV1 autres langues  \
50042                 0                  0                        0   
54                    0                  0                        0   
38055                 0                  0                        0   
32448                 0                  0                        0   
38174                 0                  0                        0   
...                 ...                ...                      ...   
853                 462                  0                        0   
24719               475                  0                        0   
17047               477                  0                        0   
29438               478                  0                        0   
44860               480                  0                        0   

       3èmes LV2 allemand  3èmes LV2 anglais  3èmes LV2 espagnol  \
50042                   0                  0                   0   
54                      0                  0                   0   
38055                   0                  0                   0   
32448                   0                  0                   0   
38174                   0                  0                   0   
...                   ...                ...                 ...   
853                     0                  0                 432   
24719                   0                  0                 426   
17047                   0                  0                 421   
29438                   0                  0                 448   
44860                   0                  0                 449   

       3èmes LV2 italien  3èmes LV2 autres langues  
50042                  0                         0  
54                     0                         0  
38055                  0                         0  
32448                  0                         0  
38174                  0                         0  
...                  ...                       ...  
853                    0                        30  
24719                  0                        49  
17047                  0                        56  
29438                  0                        30  
44860                  0                        31  

[47328 rows x 81 columns]

# Create filters: a boolean mask selects the rows for which it is True.
filter_year = df.loc[df['Rentrée scolaire'] < 2020]
#print(filter_year)

is_nancy_metz = df['Académie'] == "NANCY-METZ"
filter_academie = df.loc[is_nancy_metz]
#filter_academie

more_german_than_english = df['3èmes LV1 allemand'] > df['3èmes LV1 anglais']
filter_german = df.loc[more_german_than_english]
#filter_german

# Combine conditions with '&' (and) or '|' (or). When the comparisons are
# written inline, each one must be wrapped in its own brackets.
# NOTE(review): '|' keeps rows matching EITHER condition; use '&' if only the
# NANCY-METZ rows with more German than English are wanted.
filter_nancygerman = df.loc[is_nancy_metz | more_german_than_english]
#filter_nancygerman

# Visualize 
import matplotlib.pyplot as plt
# Scatter plot: academic region on the x axis, the '3èmes LV1 espagnol'
# column on the y axis.
df.plot.scatter(x='Région académique', y='3èmes LV1 espagnol')

# Rotate the x tick labels by 90 degrees, then display the figure.
plt.xticks(rotation=90)
plt.show()

# Histogram of the column of interest; as the last expression of the cell,
# the returned Axes is echoed by the notebook.
df[interesting_col].plot.hist()

<Axes: ylabel='Frequency'>

import spacy

# Load spaCy's small English model.
nlp = spacy.load("en_core_web_sm")

# Sentence taken from Wikipedia; running it through nlp() gives an object
# that can be iterated token by token.
pythonidae = nlp(
    "The Pythonidae, commonly known as pythons, are a family of over 40 "
    "nonvenomous snakes found in Africa, Asia, and Australia."
)

# One line per token, with the attributes named in the header line.
print("TEXT, LEMMA, POS, TAG, DEP")
for token in pythonidae:
    print(" ".join((token.text, token.lemma_, token.pos_, token.tag_, token.dep_)))

TEXT, LEMMA, POS, TAG, DEP
The the DET DT det
Pythonidae Pythonidae PROPN NNP nsubj
, , PUNCT , punct
commonly commonly ADV RB advmod
known know VERB VBN acl
as as ADP IN prep
pythons python NOUN NNS pobj
, , PUNCT , punct
are be AUX VBP ROOT
a a DET DT det
family family NOUN NN attr
of of ADP IN prep
over over ADP IN quantmod
40 40 NUM CD nummod
nonvenomous nonvenomous ADJ JJ amod
snakes snake NOUN NNS pobj
found find VERB VBN acl
in in ADP IN prep
Africa Africa PROPN NNP pobj
, , PUNCT , punct
Asia Asia PROPN NNP conj
, , PUNCT , punct
and and CCONJ CC cc
Australia Australia PROPN NNP conj
. . PUNCT . punct

from spacy import displacy

# Render the sentence with displaCy's "dep" (dependency) style.
displacy.render(pythonidae, style="dep")

# For each entity found in the sentence: its text, its start and end
# character offsets, and its label.
for ent in pythonidae.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

over 40 59 66 CARDINAL
Africa 95 101 LOC
Asia 103 107 LOC
Australia 113 122 GPE

# Render the sentence again, this time with displaCy's "ent" (entity) style
displacy.render(pythonidae, style="ent")

# Two more short documents to compare with the Pythonidae sentence
python = nlp("Python is a programming language that we are currently learning.")
master = nlp("I am a master's students  , I study computer science and programming among other things")

# Similarity of two documents
# NOTE: with en_core_web_sm each call below emits spaCy warning W007 (no word
# vectors loaded), so these scores may not be useful similarity judgements;
# the warning suggests one of the larger models instead.
print(python.similarity(master))

print(python.similarity(pythonidae))

print(pythonidae.similarity(master))

0.36686354875564575
0.22934459149837494
0.3949538469314575

/tmp/ipykernel_248728/361809591.py:5: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.
  print(python.similarity(master))
/tmp/ipykernel_248728/361809591.py:7: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.
  print(python.similarity(pythonidae))
/tmp/ipykernel_248728/361809591.py:9: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.
  print(pythonidae.similarity(master))

# CODE ME -- exercise: compare two single tokens instead of whole documents.
# NOTE(review): python_ is not used below; presumably left for the exercise.
python_ = "Pythons are snakes..."
# Token at position 3 of `python` and token at position 9 of `master`.
p3, m9 = python[3], master[9]
print(p3, m9)
# Same caveat as for documents: the small model has no word vectors (W007).
print(p3.similarity(m9))

programming study
-0.029292620718479156

/tmp/ipykernel_248728/777209842.py:6: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Token.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.
  print(p3.similarity(m9))

from tqdm import tqdm
import time

# tqdm wraps an iterable and shows a progress bar while the loop consumes it.
# The syntax is the following: for word in tqdm(word_list)

for i in tqdm(range(1, 21)):
    # tqdm.write() prints the message without overlapping the bar; a plain
    # print() makes the bar get redrawn on a fresh line at every step.
    tqdm.write(str(i))
    # Slow the loop down so the bar's progress is visible.
    time.sleep(1)

  0%|                                                    | 0/20 [00:00<?, ?it/s]

1

  5%|██▏                                         | 1/20 [00:01<00:19,  1.00s/it]

2

 10%|████▍                                       | 2/20 [00:02<00:18,  1.00s/it]

3

 15%|██████▌                                     | 3/20 [00:03<00:17,  1.00s/it]

4

 20%|████████▊                                   | 4/20 [00:04<00:16,  1.00s/it]

5

import matplotlib.pyplot as plt
import numpy as np

# Let's make a basic, random plot
xpoints = np.array([2, 5, 12, 24])
ypoints = np.array([0, 15, 2, 8])

# One line: ypoints against xpoints. Write the figure to a PDF file, then
# display it.
plt.plot(xpoints, ypoints)
plt.savefig("my_plot.pdf")
plt.show()

# Notice the syntax difference for one or multiple lines: here each plot()
# call receives a single array and adds its own line before show().
plt.plot(xpoints)
plt.plot(ypoints)
plt.show()

# Let's make it prettier/more interesting (endless possibilities/options!)
#plt.plot(xpoints, ypoints, marker="o")
#plt.plot(xpoints, ypoints, linestyle='dotted')
# linewidth is a number, so it is passed as 10 rather than the string '10'.
plt.plot(xpoints, ypoints, color='r', linewidth=10, linestyle="dotted", marker="o")

plt.xlabel("Potato")
plt.ylabel("Time")
# plt.grid() draws grid lines for both axes; axis='x' restricts them to the x axis.
#plt.grid()
plt.grid(axis='x')
plt.show()

# More plot types!
plt.scatter(xpoints, ypoints)
plt.show()

# Bar chart with thin bars...
plt.bar(["A", "B", "C", "D"], xpoints, width=0.1)
plt.show()

# ...and the same chart with the default bar width.
# (For horizontal bars, plt.barh takes height= where plt.bar takes width=,
# e.g. height=0.1.)
plt.bar(["A", "B", "C", "D"], xpoints)
plt.show()

# Note: be careful when plotting, try to make it color-blind friendly for greater inclusivity!
plt.style.use('tableau-colorblind10')

y = np.array([35, 25, 25, 15])
mylabels = ["Apples", "Bananas", "Cherries", "Dates"]

# Pie chart with one labelled wedge per value, plus a titled legend.
plt.pie(y, labels=mylabels)
plt.legend(title="Four Fruits:", loc="upper right")
plt.show()

Some useful libraries

NumPy

pandas

spaCy

tqdm

Matplotlib