From fdea95ab5defcedc839d2ff1cee43e5c61f0cffb Mon Sep 17 00:00:00 2001 From: Nemanja Stojanovic Date: Thu, 5 Dec 2024 19:30:30 -0500 Subject: [PATCH 1/3] add python DA --- python/data-analysis/README.md | 16 ++ .../data-analysis/pyda-analyzing-ii/README.md | 15 ++ .../counting-and-transforming-series-ii.md | 182 ++++++++++++++ .../counting-and-transforming-series.md | 114 +++++++++ ...movie-count-per-release-year-visualized.md | 207 ++++++++++++++++ .../movie-count-per-release-year.md | 60 +++++ .../pyda-analyzing-ii/tvshow-ratings.md | 133 +++++++++++ .../visualize-tvshow-ratings.md | 125 ++++++++++ .../pyda-analyzing-iii/README.md | 14 ++ .../computing-group-sizes.md | 53 +++++ .../pyda-analyzing-iii/movies-per-actor-ii.md | 138 +++++++++++ .../pyda-analyzing-iii/movies-per-actor.md | 99 ++++++++ .../total-number-of-actors.md | 120 ++++++++++ .../pyda-analyzing-iii/tvshow-per-actor.md | 46 ++++ .../data-analysis/pyda-analyzing-iv/README.md | 11 + .../pyda-analyzing-iv/longest-movies-ii.md | 161 +++++++++++++ .../pyda-analyzing-iv/longest-movies.md | 114 +++++++++ .../longest-running-shows.md | 109 +++++++++ python/data-analysis/pyda-analyzing/README.md | 17 ++ .../pyda-analyzing/dataframe-loc.md | 78 ++++++ .../data-analysis/pyda-analyzing/end-goals.md | 111 +++++++++ .../pyda-analyzing/modifying-pie-chart-ii.md | 137 +++++++++++ .../pyda-analyzing/modifying-pie-chart.md | 86 +++++++ .../pyda-analyzing/movie-tvshow-count.md | 112 +++++++++ .../pyda-analyzing/movies-per-rating.md | 99 ++++++++ .../pyda-analyzing/visualize-movie-ratings.md | 131 ++++++++++ .../visualize-movie-show-count.md | 156 ++++++++++++ .../pyda-da-analysis-environments/README.md | 13 + .../creating-your-first-notebook.md | 49 ++++ .../different-tools-to-use.md | 103 ++++++++ .../ipython-vs-shell-vs-scripts.md | 106 +++++++++ .../notebooks.md | 112 +++++++++ .../what-are-analysis-environments.md | 55 +++++ python/data-analysis/pyda-da-tips/README.md | 10 + .../pyda-da-tips/pandas-profiling.md 
| 61 +++++ .../pyda-da-tips/str-contains.md | 225 ++++++++++++++++++ .../README.md | 14 ++ .../cleaning-dataset-ii.md | 104 ++++++++ .../cleaning-dataset.md | 99 ++++++++ .../importing-data-sets.md | 125 ++++++++++ .../series-and-dataframes.md | 156 ++++++++++++ .../what-and-why-pandas.md | 85 +++++++ .../data-analysis/pyda-introduction/README.md | 13 + .../pyda-introduction/analysis-example.md | 45 ++++ .../brief-introduction-to-data-analysis.md | 28 +++ .../pyda-introduction/prerequisites.md | 30 +++ .../python-data-libraries.md | 77 ++++++ .../why-python-for-data-analysis.md | 29 +++ python/python-core/README.md | 2 +- 49 files changed, 4184 insertions(+), 1 deletion(-) create mode 100644 python/data-analysis/README.md create mode 100644 python/data-analysis/pyda-analyzing-ii/README.md create mode 100644 python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md create mode 100644 python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md create mode 100644 python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md create mode 100644 python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md create mode 100644 python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md create mode 100644 python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md create mode 100644 python/data-analysis/pyda-analyzing-iii/README.md create mode 100644 python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md create mode 100644 python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md create mode 100644 python/data-analysis/pyda-analyzing-iii/movies-per-actor.md create mode 100644 python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md create mode 100644 python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md create mode 100644 python/data-analysis/pyda-analyzing-iv/README.md create mode 100644 python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md create mode 100644 
python/data-analysis/pyda-analyzing-iv/longest-movies.md create mode 100644 python/data-analysis/pyda-analyzing-iv/longest-running-shows.md create mode 100644 python/data-analysis/pyda-analyzing/README.md create mode 100644 python/data-analysis/pyda-analyzing/dataframe-loc.md create mode 100644 python/data-analysis/pyda-analyzing/end-goals.md create mode 100644 python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md create mode 100644 python/data-analysis/pyda-analyzing/modifying-pie-chart.md create mode 100644 python/data-analysis/pyda-analyzing/movie-tvshow-count.md create mode 100644 python/data-analysis/pyda-analyzing/movies-per-rating.md create mode 100644 python/data-analysis/pyda-analyzing/visualize-movie-ratings.md create mode 100644 python/data-analysis/pyda-analyzing/visualize-movie-show-count.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/README.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/notebooks.md create mode 100644 python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md create mode 100644 python/data-analysis/pyda-da-tips/README.md create mode 100644 python/data-analysis/pyda-da-tips/pandas-profiling.md create mode 100644 python/data-analysis/pyda-da-tips/str-contains.md create mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md create mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md create mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md create mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md create 
mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md create mode 100644 python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md create mode 100644 python/data-analysis/pyda-introduction/README.md create mode 100644 python/data-analysis/pyda-introduction/analysis-example.md create mode 100644 python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md create mode 100644 python/data-analysis/pyda-introduction/prerequisites.md create mode 100644 python/data-analysis/pyda-introduction/python-data-libraries.md create mode 100644 python/data-analysis/pyda-introduction/why-python-for-data-analysis.md diff --git a/python/data-analysis/README.md b/python/data-analysis/README.md new file mode 100644 index 0000000000..c4d4893c7b --- /dev/null +++ b/python/data-analysis/README.md @@ -0,0 +1,16 @@ +name: Python Data Analysis +description: How to use Python to analyze data. + +sections: + '0': + - pyda-introduction + - pyda-da-analysis-environments + - pyda-initializing-and-cleaning-datasets + - pyda-analyzing + - pyda-analyzing-ii + - pyda-analyzing-iii + - pyda-analyzing-iv + - pyda-da-tips + +next: + - python:functional-programming diff --git a/python/data-analysis/pyda-analyzing-ii/README.md b/python/data-analysis/pyda-analyzing-ii/README.md new file mode 100644 index 0000000000..712b0914b4 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/README.md @@ -0,0 +1,15 @@ +name: Analysis II + +description: Learn more methods for manipulating a dataset and visualizing outputs.
+ +insights: + - counting-and-transforming-series + - counting-and-transforming-series-ii + - tvshow-ratings + - visualize-tvshow-ratings + - movie-count-per-release-year + - movie-count-per-release-year-visualized + +aspects: + - workout + diff --git a/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md b/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md new file mode 100644 index 0000000000..5bb5c9171b --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md @@ -0,0 +1,182 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [to_frame before reset_index](https://stackoverflow.com/questions/40914200/can-i-assign-a-reset-index-a-name){documentation} + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Counting & Transforming a Series II + +--- +## Content + +Using the same `df` from the previous insight[1]. + +### .to_frame() + +This method can only be used on a `Series`. + +It transforms a `Series` into a `DataFrame`. + +There is only one parameter, `name`. You use it to specify the name of the column. + +Using the same example from the previous insight, we will add this function and a name for the column: + +```python +# Previous DataFrame +# Saved into the same variable +df = df.groupby("col1").size() + +# It is now a Series + +# Transform it into a DataFrame +df.to_frame(name="sum") +``` + +From this code, we get this output: + +![groupby-toframe](https://img.enkipro.com/fd333ed70937bef66a27f7eaf474b9c1.png) + +However, now we don't have an index and need to reset it. + +### .reset_index() + +This method is used to reset an index of a `DataFrame`. + +When resetting an index, if one already exists, it will become a column, and a new logical index from 0 to length will be assigned. 
+ +Let's reset it for the first column by adding `.reset_index('col1')`. + +![final-output-toframe](https://img.enkipro.com/eb130d395d576c5e425fab2a948f3f44.png) + +Now we have a `DataFrame` output. + +If we reset again, we get a new column and index. + +![reset-again](https://img.enkipro.com/2c81a2be21196e7e6f28f26030ac7587.png) + +Another way we could've done this process is by skipping `to_frame()`. + +If we skip this method, we have to pass a `name` parameter to `reset_index()` to choose a different name for the column. + +This is because, if we reset an index on a `Series` and add it as a column, we will have a `DataFrame` output: + +![new-column-blah](https://img.enkipro.com/d42a98e46bf8547b37abb42e5528ab0b.png) + +> The main reason we would want a regular `DataFrame` as output is so that we can continue manipulating it. + +> Let's see in the next insight how this works on our `netflix_series` dataset. + +--- + +## Practice + +Given the `df`: + +```python +df = pd.DataFrame( + { + 'col1': [1, 1, 1, 1, 2], + 'col2': [2, 3, 4, 5, 9] + } +) +``` + +What would the output look like if we do this: +```python +df.groupby('col1').size() +``` + +```python +# Option A +col1 +1 4 +2 1 +dtype: int64 + +# Option B +col1 +1 2 +2 4 +dtype: int64 + +# Option C +col1 +1 1 +2 1 +dtype: int64 + +# The output would look like +# Option ??? +``` + +- A +- B +- C + +--- + +## Revision + +Given the `Series` output: +```python +# df.groupby('col1').size() +col1 +1 4 +2 1 +dtype: int64 +``` + +Fill in the code to transform the output `Series` back into a `DataFrame` object. Name the new column `"Sum"`. 
+```python +df.???(???="???") +``` + +- to_frame +- name +- Sum +- column +- label +- sum +- to_dataframe + +--- + +## Footnotes + +[1: Previous Result] + +```python +df = pd.DataFrame( + { + 'col1': ['A', 'A', 'A', 'B'], + 'col2': [1, 2, 3, 4] + } +) +``` + +The output looks like this: + +![simple-df](https://img.enkipro.com/f9b6e16544fa99a6814f987648715061.png) + +Add `.groupby("col1").size()`: + +![output-without-reseting-index](https://img.enkipro.com/1acf3dd12a251c308f497ea3d43019fc.png) \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md b/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md new file mode 100644 index 0000000000..3500dc12a5 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md @@ -0,0 +1,114 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [to_frame before reset_index](https://stackoverflow.com/questions/40914200/can-i-assign-a-reset-index-a-name){discussion} + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Counting & Transforming a Series I + +--- +## Content + +In the previous workout, we found how many movies of each rating are on this dataset[1]. + +Now, let's find out how many TV Shows for each rating are there but in a different way. + +We will use `groupby().size()`, `to_frame()` and `reset_index`. + +### .groupby().size() + +`groupby()` is used to group data by a series of columns. + +`.size()` on a `Series` returns the number of rows. Whilst on a `DataFrame` it returns the number of rows times the number of columns. + +`.groupby().size()` returns the number of rows in each group for the specified column. The column is specified in `groupby('HERE')`. + +> `.groupby().size()` on a `DataFrame` returns a `Series`.
This is why we use `.size()` here. To get the exact number. + +Let's say you have this `DataFrame`: + +```python +df = pd.DataFrame( + { + 'col1': ['A', 'A', 'A', 'B'], + 'col2': [1, 2, 3, 4] + } +) +``` + +The output looks like this: + +![simple-df](https://img.enkipro.com/f9b6e16544fa99a6814f987648715061.png) + +If we add `.groupby("col1").size()` we get this output: + +![output-without-reseting-index](https://img.enkipro.com/1acf3dd12a251c308f497ea3d43019fc.png) + +In `col1`, there are three A's and one B. + +At that point, our output is no longer a `DataFrame` but a `Series`. + +> We will continue in the next insight. + +--- + +## Practice + +The method that counts the number of rows on a `Series` or the number of rows times the number of columns on a `DataFrame`: + +```python +??? +``` + +The method that groups data by a series of columns is: +```python +??? +``` + +This method returns the number of rows in each group of the specified column: +```python +??? +``` + +- .size() +- .groupby() +- .groupby().size() +- .sizeby().group() +- .group() +- .sizeby() + +--- + +## Revision + +Fill in the code to count the total number of rows for each group in the `'col3'` column of the `grades` `DataFrame`. + + +```python +????????? 
+``` + +- grades +- .groupby('col3') +- .size() +- Grades +- groupby(col3) +- .sum() diff --git a/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md b/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md new file mode 100644 index 0000000000..8b98f99641 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md @@ -0,0 +1,207 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Line Styles](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html){documentation} + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Movie Count Per Release Year + +--- +## Content + +Now that we have these values extracted and ordered[1], let's make some line charts. + +Line charts are best used to compare changes over time. They are useful for both small and large groups of data when trying to look for a trend or a measure over time. + +```python +# Plot the X values with data from the release_year column of the `before_1979` df, +# Plot the Y values with the data from the `movie_count` column of the `before_1979` df, +# Add a label "Until 1979" +# Make the style of the line dashed +# Add a marker 'o' on break points +# Color the line in a green color +plt.plot(before_1979.release_year,\ + before_1979.movie_count,\ + label="Until 1979",\ + linestyle='--',\ + marker='o',\ + color='g' +) + +# Set the X axis label to "Release Year" +plt.xlabel('Release Year') +# Set the Y axis label to "Count" +plt.ylabel('Count') +# Set a title of the current axes. +plt.title('Movie VS TV Show count by release year') +# show a legend on the plot +plt.legend() +# Display a figure.
+plt.show() +``` + +The output of this code looks like this: + +![movies-before-1979](https://img.enkipro.com/22e7c709119de6a088cbd0cd89a88d4c.png) + +We will use the same code for the other two groups, except we will make some small changes to the styling. + +For movies between 1980 and 2000, we will use blue color, the same marker, and a full line. + +```python +linestyle='-',\ +marker='o',\ +color='blue' +``` + +You can view the full code here[2]. + +![movies-after-2000](https://img.enkipro.com/1c7a7a522724ac61443c3806ca4abf33.png) + +For movies released after 2000, we will use a red color, an X marker, and a dotted line[3]. + +```python +linestyle=':',\ +marker='x',\ +color='#FF0000' +``` + +![movies-between-1980-1999](https://img.enkipro.com/998ce9deaacc3485cb8d967915196c51.png) + +As you can see, the colors of the line can be assigned with a single letter, a hex string, or a full name. There are also a lot of different markers you can choose from. To check out all the different styles, visit the link in the **Learn More** section. + +> Here is all the work we have done so far in a notebook: + +[Google Collab Notebook](https://colab.research.google.com/drive/1v69QDwPVvbS3BMxlG4cAZIoDGh0I8l-R?usp=sharing) + +--- + +## Practice + +Fill in the gaps to plot a line-styled chart. The chart should have an `x` marker, a dotted line, and red color. + +```python +import matplotlib.pyplot as plt + +plt.???(x1, x2, \ + label="This is a label",\ + ??? =':',\ + marker='???',\ + ???='???') +``` + + +- plot +- linestyle +- x +- color +- red + +--- + +## Revision + +Fill in the gaps to plot a line styled chart. The chart should have an `>` marker, a solid line, and purple color. 
+ +```python +import matplotlib.pyplot as plt + +???.???(x1, x2, \ + label="Some Label",\ + linestyle ='???',\ + marker='???',\ + ???='???') +``` + +- plt +- plot +- `-` +- `>` +- color +- purple + +--- + +## Footnotes + +[1:Previous insight] + +After extracting the values, we have re-ordered them based on the `release_year` in descending order. +```python +after_2000 = movies_after_2000\ + .sort_values('release_year', ascending=False) + +before_1979 = movies_before_1980\ + .sort_values('release_year', ascending=False) + +between_1980_2000 = movies_between_1980_2000\ + .sort_values('release_year', ascending=False) +``` + +[2: Movies Between 1980 & 2000] +```python +# Plot the X values with data from the release_year column of the `between_1980_2000` df, +# Plot the Y values with the data from the `movie_count` column of the `between_1980_2000` df, +# Add a label +# Make the style of the line full +# Add a marker 'o' on break points +# Color the line in a green color +plt.plot(between_1980_2000.release_year,\ + between_1980_2000.movie_count, \ + label="Between 1980 & 2000",\ + linestyle='-',\ + marker='o',\ + color='blue' +) + +# Set the X axis label to "Release Year" +plt.xlabel('Release Year') +# Set the Y axis label to "Count" +plt.ylabel('Count') +# Set a title of the current axes. +plt.title('Movie Count By Year Between 1980 & 2020') +# show a legend on the plot +plt.legend() +# Display a figure. +plt.show() +``` + +[3: Movies After 2000] +```python +plt.plot(after_2000.release_year,\ + after_2000.movie_count,\ + label="Movies",\ + linestyle=':',\ + marker='x',\ + color='#FF0000' +) + +# Set the X axis label to "Release Year" +plt.xlabel('Release Year') +# Set the Y axis label to "Count" +plt.ylabel('Count') +# Set a title of the current axes. +plt.title('Movie Count By Year After 2000') +# show a legend on the plot +plt.legend() +# Display a figure. 
+plt.show() +``` diff --git a/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md b/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md new file mode 100644 index 0000000000..9a13572a1e --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md @@ -0,0 +1,60 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +--- + +# Movie Count Per Release Year + +--- +## Content + +To determine the movie count per year released we have to `groupby` the `release year` and create a new column labeled `"Movie Count"` by resetting the index. + +Here's how that looks: +```py +# Save in movies by release year +# Group by the year of release +# Get the row count for each group +# Sort the values in descending order +# Reset index and create new column labeled Movie Count +movies_by_release_year = netflix_movies.groupby('release_year')\ + .size()\ + .sort_values(ascending=False)\ + .reset_index(name='movie_count') +``` + +The output looks like this: + +![movies-by-release-year](https://img.enkipro.com/8edbf5b4ac2e83772d5ccb533d7daeda.png) + +Since this output has 70 rows, visualizing it in a single chart will have too much information. Let's split it into 3 `DataFrames` and visualize each of them: + +```py +# Movies released on and prior to 1979 +movies_before_1980 = movies_by_release_year[movies_by_release_year['release_year'] <= 1979] + +# Movies released between 1980 and 1999 +movies_between_1980_2000 = movies_by_release_year[(movies_by_release_year['release_year'] >= 1980) & (movies_by_release_year['release_year'] < 2000)] + +# Movies released on and after 2000 +movies_after_2000 = movies_by_release_year[movies_by_release_year['release_year'] >= 2000] +``` + +Now that they are extracted, they are still in order based on the number of movies per year of release.
Let's change the order to be according to the year of release for each group and save them in a new variable: + +```python +after_2000 = movies_after_2000\ + .sort_values('release_year', ascending=False) + +before_1979 = movies_before_1980\ + .sort_values('release_year', ascending=False) + +between_1980_2000 = movies_between_1980_2000\ + .sort_values('release_year', ascending=False) +``` + +> We will plot them in the next insight. \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md b/python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md new file mode 100644 index 0000000000..8998aac633 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md @@ -0,0 +1,133 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [to_frame before reset_index](https://stackoverflow.com/questions/40914200/can-i-assign-a-reset-index-a-name){documentation} + + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# TV Shows Ratings + +--- +## Content + +Now that we know how the previous code works, let's apply it to `netflix_series`. + +> There are two ways we can do this. + +### With to_frame & reset_index + +```py +# On netflix_series, group by rating +# count the number of rows for each individual rating +# sort the values in descending order +# transform the series into a DataFrame +# label the new column "Sum of TV Shows" +# Reset the index of the rating column. +netflix_series\ + .groupby('rating').size()\ + .sort_values(ascending=False)\ + .to_frame("Sum of TV Shows")\ + .reset_index('rating') +``` + +The output of that code is: + +![toframe-output](https://img.enkipro.com/92ca8af7959786cd9bebb9c0898e3010.png) + +> `\` is an escape character. If you use it at the end of a line you can tell the interpreter to extend the logical line to the next physical line. 
+ +In our case, we have extended our logical line to 5 physical lines so it's easier to read. + +### With reset_index + +Another way for doing this with one less step. + +```py +# On netflix_series, group by rating +# count the number of rows for each individual rating +# sort the values in descending order +# reset the index and add a new column +# Sum of TV Shows +netflix_series\ + .groupby('rating').size()\ + .sort_values(ascending=False)\ + .reset_index(name='Sum of TV Shows') +``` + +The output of this code is exactly the same. + +![series-per-rating-count](https://img.enkipro.com/a7699cddce250f540b7259c4d0a10e2b.png) + +The difference is that in the 2nd example we set the new column through the index. + +Both ways work the same and it's up to you which one you want to use in your code. + +--- + +## Practice + +Given the code with missing parts, and the description, make the code match the description. + +```python +???.groupby('???')???\ + ???(ascending=False)\ + .reset_index(name='???') +``` + +Group the `people` `DataFrame` by its `Age` column. Count the number of rows for each `Age` group. Sort the values in descending order. Reset the index to a new logical one from 0 to length and add a new column called `Count By Age` where the `groupby.size` will display the output. + +- `people` +- `Age` +- `.size()` +- `.sort_values` +- `Count By Age` +- `count by age` +- `age` + +--- + +## Revision + +Fill the missing code to group the `people` df by its `Age` column. Sort the values in descending order. Transform it back to a df where the new column's name will be `Count`, then reset the index by the `Age` column. + +```python +# On netflix_series, group by rating +# count the number of rows for each individual rating +# sort the values in descending order +# transform the series into a DataFrame +# label the new column "Sum of TV Shows" +# Reset the index of the rating column. 
+people\ + .groupby('???').size()\ + ???(???)\ + ???('Count')\ + ???('Age') +``` + +- Age +- .sort_values +- ascending=False +- .to_frame +- .reset_index +- index +- .to_data_frame +- descending=True +- .sort +- age diff --git a/python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md b/python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md new file mode 100644 index 0000000000..98230fb308 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md @@ -0,0 +1,125 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# Visualize TV Show Ratings + +--- +## Content + +Previously we used this[1] code to get a `DataFrame` output containing all TV shows sorted by rating in this dataset. + +Now, let's visualize it. + +First, we have to save what we found and then plot it. + +```py +# Save the previous code in shows_per_rating +shows_per_rating = netflix_series\ + .groupby('rating').size()\ + .sort_values(ascending=False)\ + .reset_index(name='Sum of TV Shows') +``` + +We will also add colors to the bar chart by adding a parameter called `color`: + +> Colors are not necessary to create a chart. However, they are useful for color-coding. + +```python +# Plot the bar chart with the rating on X axis +# and Sum of TV Shows on Y axis +shows_per_rating.plot( + x="rating", + y="Sum of TV Shows", + kind="bar", + color=['red','green', 'blue'] +) +# Show the plot +plt.show() +``` + +> When adding colors, you do not have to specify a color per bar. You can specify less than the total amount of bars and the bars will alternate in the order of the specified colors. 
+ +This is the output: + +![shows-per-rating](https://img.enkipro.com/cfefb3658f1a04a37e2b36a0ad4ad519.png) + +On the other hand, if you specify more colors than there are bars, the bars will be colored according to the specified colors while all the remaining colors are ignored. + +--- + +## Practice + +Finish the code to create alternating colors on a bar chart: + +```python +df.plot( + x=x1, + y=x2, + ???="???", + ???=['red','green', 'blue'] +) +plt.show() +``` + +- kind +- bar +- color +- colors +- type +- Bar + +--- + +## Revision + +Given the code below: + +```python +plt.plot( + x=x1, + y=x2, + kind="bar", + color=['pink', 'green', 'yellow'] +) +plt.show() +``` + +If our output bar chart has more than three bars, how will those bars be colored? + +??? + +- Alternating in the order of the specified colors repeating after the last color. Pink, green, yellow, then pink, green, yellow again, and so on. +- Random order between the three specified colors. +- Each bar will be a combination of the specified colors. + + +--- +## Footnotes +[1:Previous Code] + +```py +netflix_series\ + .groupby('rating').size()\ + .sort_values(ascending=False)\ + .reset_index(name='Sum of TV Shows') +``` + +The output of this code is: + +![series-per-rating-count](https://img.enkipro.com/a7699cddce250f540b7259c4d0a10e2b.png) \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-iii/README.md b/python/data-analysis/pyda-analyzing-iii/README.md new file mode 100644 index 0000000000..421bb9f81c --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/README.md @@ -0,0 +1,14 @@ +name: Analysis III + +description: Learn even more ways to manipulate a dataset and visualize outputs. 
+ +insights: + - total-number-of-actors + - movies-per-actor + - movies-per-actor-ii + - tvshow-per-actor + - computing-group-sizes + +aspects: + - workout + diff --git a/python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md b/python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md new file mode 100644 index 0000000000..8f0ba53169 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md @@ -0,0 +1,53 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +--- + +# Computing Group Sizes. + +--- +## Content + +Now that we have `moviePerActor` and `tvshowPerActor`, we can use them to count how many actors have a specific number of movies/shows. + +### Movies + +```py +# Group by Sum of Movies +# count rows +# transform to DataFrame +# new column named "Actors" +# sort values in descending order by new column +moviePerActor.groupby("Sum of Movies")\ + .size()\ + .reset_index(name="Actors")\ + .sort_values("Actors",ascending=False) +``` + +The output looks like this: + +![movie-per-actor](https://img.enkipro.com/715b109ee5c3549bf2ab8104509f6031.png) + +### TV Shows + +```py +tvshowPerActor.groupby("Sum of TV Shows")\ + .size()\ + .reset_index(name="Actors")\ + .sort_values("Actors",ascending=False) +``` +Gives this output: + +![show-per-actor](https://img.enkipro.com/69d1e2673e1b43ea2cf95ffb52b11fb8.png) + +Using this information we can determine that over 10,000 actors have starred in 1 movie, but only 1 has starred in 32. + +Same for shows; 9484 actors have starred in 1 show and only 1 in 18. 
+ +> Here is all the work we have done so far in a notebook: + +[Google Collab Notebook](https://colab.research.google.com/drive/14jMcJjW5iu6vf2_oQDPEm3oi5jzWuPOj?usp=sharing) \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md b/python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md new file mode 100644 index 0000000000..7c06c82780 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md @@ -0,0 +1,138 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Stack](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html){documentation} + - >- + [Rename](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html){documentation} + +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Movies Per Actor Continued + +--- +## Content + +In the previous insight, we used this code[1] to split the cast column for the movie cast. But we didn't finish modifying it. + +The next thing we need to do is rename the `0` labeled column. + +To rename a column, you need to use `.rename(columns={"column":"New Name"})`. 
+ +There are two options: +```python +# Rename by column index +df.rename(columns={0: "A", 1: "B"}) + +# Rename by column name +df.rename(columns={"B": "b", "C": "c"}) +``` + +In our case, all we have to do is: +```python +splitMovieCast = \ + splitMovieCast\ + .rename(columns={0: 'Actor'}) +``` + +The output looks like this: + +![rename-split-movie-cast](https://img.enkipro.com/ebb6cd3ba5f0f6806fb22eb8650e5244.png) + +Next, we can sort the values and swap the columns using this code: + +```python +splitMovieCast\ + .sort_values('Actor')[['Actor', 'type']] +``` + +![sort-value-reorder-column](https://img.enkipro.com/766cd3b731d60c2f5434041b4a2b4478.png) + +Then we reset the index to have a logical index from 0 to the length of the column: + +![reset-to-type](https://img.enkipro.com/8f8a386e584d4a9c61c4ce051636417b.png) + + +As mentioned in a previous workout, `reset_index` resets the index and adds it as a new column. The `drop=True` prevents it from doing that. + +Here is how the above code would look if we didn't use `drop=True`: + +![no-drop-true](https://img.enkipro.com/f2aad21978de43d81590aa1a80f9e2f2.png) + +The next step is something we have already done before, `groupby().size()`. + +```py +# Save in moviePerActor +# group by Actor +# Get the number of rows for each actor +# Sort in descending order +# reset the index to Sum of Movies +moviePerActor = splitMovieCast\ + .groupby('Actor')\ + .size()\ + .sort_values(ascending=False)\ + .reset_index(name='Sum of Movies') +``` + +When we call `moviePerActor` we get this output: + +![movie-per-actor](https://img.enkipro.com/0abc712cb32ead2f1275255e3d4c605a.png) + +This output gives us actor names, the total count of movies they were in, and the total number of actors present across all movies in this dataset; `18860`. + +> Let's do the TV Show actor count in the next insight. + +--- + +## Revision + +Fill in the gaps to make the comments valid. 
+ +```py +humans + ???(columns={???:'People'})\ + .sort_values('???')\ + .reset_index(drop=???) + +# Rename the second column of the humans df to "People" +# Sort the values by the People column +# reset the index but don't add a new column + +``` + +- .rename +- 1 +- People +- True +- False +- rename +- 2 +- people + + +--- +## Footnotes +[1: Previous Code] + +```python +# Save to splitMovieCast +# Set the Index to an existing column (type) +# split the cast column by the `, ` delimiter +# stack each value into individual rows +# Reset the index to the previously selected column +splitMovieCast = movieCast.set_index('type')\ + .cast.str.split(', ', expand=True)\ + .stack()\ + .reset_index('type') +``` \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-iii/movies-per-actor.md b/python/data-analysis/pyda-analyzing-iii/movies-per-actor.md new file mode 100644 index 0000000000..f81ad7ae3a --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/movies-per-actor.md @@ -0,0 +1,99 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Stack](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html){documentation} + - >- + [Set_index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html){documentation} + +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Movies Per Actor + +--- +## Content + +Now that we know how to split a string column, let's split `movieCast` and `tvshowCast`. + +However, we also need to know what `set_index` and `.stack()` are. + +`set_index` is used to set the index of the `DataFrame` using one of the existing columns. + +The `.stack()` method is used to return a reshaped `DataFrame` or `Series`. The output depends on if the columns have a single level (`Series`) or multiple levels (`DataFrame`). 
+ +Here is the code we will use to split the `cast` column: + +```py +# Save to splitMovieCast +# Set the Index to an existing column (type) +# split the cast column by the `, ` delimiter +# stack each value into individual rows +# Reset the index to the previously selected column +splitMovieCast = movieCast.set_index('type')\ + .cast.str.split(', ', expand=True)\ + .stack()\ + .reset_index('type') +``` + +Let's see how this code behaves line by line. + +First off, here is how the `movieCast` df looks like: + +![moviecast](https://img.enkipro.com/23051c00c5838e6025ac34e5fc9868a5.png) + +When we do `movieCast.set_index('type')` we remove the index and set the `type` column as the new index: + +![moviecast-setindex](https://img.enkipro.com/9d412222b4f38824e41dd4da07313436.png) + +Next, we split the `cast` column by the `, ` delimiter: + +![moviecast-set-split](https://img.enkipro.com/eb2d93fef0eb3af0a31449291bf576fb.png) + +Then we stack everything back into individual rows: + +![moviecast-set-split-stack](https://img.enkipro.com/f2a2b98eb631abd09bcc7eb598d6756b.png) + +Lastly, we reset the index and our `DataFrame` looks like this: + +![moviecast-set-split-stack-reset](https://img.enkipro.com/0eaf1f4bffe913833b249201cd298946.png) + +Then we save that code in `splitMovieCast`. However, we are not done yet! + +> We will continue in the next insight. + +--- + +## Revision + +Fill in the gaps to make the comments valid. 
+ +```py +df.set_index('???')\ + ??????('-', expand=True)\ + ???\ + ???('name') + +# Set the Index to an existing column 'name' +# split the people column by the `-` delimiter +# stack each value into individual rows +# Reset the index to the previously selected column +``` + +- name +- .people +- .str.split +- .stack() +- .reset_index +- set_index +- stack[] diff --git a/python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md b/python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md new file mode 100644 index 0000000000..2d580c931b --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md @@ -0,0 +1,120 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Total Number Of Actors + +--- +## Content + +In the first Analyzing workout, we set up a couple of questions that we will try to answer regarding the `netflix_titles` dataset. + +In this workout, we will answer **what the total number of actors is**. + +To get the total number of actors, we first have to separate the comma-separated values from the `cast` column. + +Also, we knew from the beginning that the `cast` column had some missing information[1]. + +The cells with missing information have been auto-filled with `NaN` values. + +To remove rows that have `NaN` we have to use the `.dropna` method. + +```py +# Remove all nan values from `cast` column +# Save into noNanValuesDF +noNanValuesDF = importedData.dropna(subset=['cast']) +``` +`subset` is used to specify the column. If no column is specified, all columns will drop rows with `NaN` values. + +Next, let's separate the movie cast from the TV Show cast: +```py +# Locate rows where the column called "type" has "Movies" in it. 
+movieCast = noNanValuesDF.loc[noNanValuesDF['type'] == 'Movie']
+# 3905
+
+# Locate rows where the column called "type" has "TV Show" in it.
+tvshowCast = noNanValuesDF.loc[noNanValuesDF['type'] == 'TV Show']
+# 1759
+```
+
+Now that they are split, let's separate the `cast` field by the `, ` delimiter.
+
+To split a `string` column you have to use the `.str.split()` method.
+
+The syntax for this method is:
+```py
+.str.split(pat=None, n=- 1, expand=False)
+```
+
+`pat` is the delimiter, if not specified, the default value is whitespace.
+
+`n` is used to limit the number of splits. If not specified, the default value is -1. It means "Split all".
+
+`expand` is a bool field. If set to `True`, it returns a `DataFrame`. If set to `False`, it returns a `Series` containing a list of strings.
+
+In our case, we want to split by the `, ` delimiter, split all, and return a `DataFrame`.
+
+That means we will use this code:
+```py
+.str.split(', ', expand=True)
+```
+
+> The extra space is there because there is a space after every comma-separated value. When we split values like: "Stefan, Maria", we don't want to be left with `Stefan` and ` Maria`.
+
+> We will continue in the next insight.
+
+---
+
+## Practice
+
+Which method can be used to remove all rows that have a `NaN` value?
+
+```python
+???
+```
+
+- .dropna()
+- .removena()
+- .dropNaN()
+- .removeNaN()
+
+---
+
+## Revision
+
+Fill in the code to drop all rows that have `NaN` values in the `Age` column of the `school` dataset and save it in a new variable called "`noNanValues`".
+
+```python
+
+??? 
= \ + school.???(???=['???']) +``` + +- noNanValues +- dropna +- subset +- Age +- age +- nonanvalues +- SUBSET + +--- +## Footnotes +[1:Missing Information] + +![missing-cast](https://img.enkipro.com/b382ceeb3ea3513cf9f1874b7a7e7354.png) diff --git a/python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md b/python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md new file mode 100644 index 0000000000..735236c717 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md @@ -0,0 +1,46 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +--- + +# Total Number Of TV Show Actors + +--- +## Content + +The process for TV show actors is exactly the same: + +```py +splitTVShowCast = tvshowCast.set_index('type')\ + .cast.str.split(', ', expand=True)\ + .stack()\ + .reset_index('type')\ + .rename(columns={0:'Actor'})\ + .sort_values('Actor')[['Actor', 'type']]\ + .reset_index(drop=True) +``` + +Which gives this output: + +![split-tvshow-cast](https://img.enkipro.com/4af36d03ef84957b3b3e270984900c35.png) + +Then we group and sort again: + +```py +tvshowPerActor = splitTVShowCast\ + .groupby('Actor')\ + .size()\ + .sort_values(ascending=False)\ + .reset_index(name='Sum of TV Shows') + +# Call +tvshowPerActor +``` + +And get this output: + +![tvshow-per-actor](https://img.enkipro.com/cc41ead80e14bd00ae6342f1b49b5d8b.png) diff --git a/python/data-analysis/pyda-analyzing-iv/README.md b/python/data-analysis/pyda-analyzing-iv/README.md new file mode 100644 index 0000000000..889b532532 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iv/README.md @@ -0,0 +1,11 @@ +name: Analysis IV + +description: Learn how to separate values for easier analysis. 
+
+insights:
+  - longest-running-shows
+  - longest-movies
+  - longest-movies-ii
+
+aspects:
+  - workout
\ No newline at end of file
diff --git a/python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md b/python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md
new file mode 100644
index 0000000000..585a3b1b1b
--- /dev/null
+++ b/python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md
@@ -0,0 +1,161 @@
+---
+author: Stefan-Stojanovic
+
+type: normal
+
+category: how-to
+
+links:
+  - >-
+    [Astype](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html){documentation}
+
+practiceQuestion:
+  formats:
+    - fill-in-the-gap
+    - type-in-the-gap
+  context: standalone
+revisionQuestion:
+  formats:
+    - fill-in-the-gap
+  context: standalone
+
+---
+
+# Longest Running Movies Continued
+
+---
+## Content
+
+Now that our string column has only numbers, let's convert it to numerical values so that we can sort it.
+
+> Strings are not sorted the same way numbers are[1].
+
+Because of this, we need to first transform the values into numbers.
+
+`.astype()` on a `Series` changes the type of values to a specified one.
+
+Here's how we can transform `duration`'s values to `int`:
+```py
+# Transform duration column to numerical values and save into x
+x = movieDuration['duration'].astype(int)
+```
+
+The output of `x` is just a `Series` of numerical values:
+
+![x=series](https://img.enkipro.com/21affff2db3caa5fbb12820b32c9da1a.png)
+
+> The order stays exactly the same as it was.
+
+Then we are going to create a new `DataFrame` using `.assign`:
+
+```py
+# In movieDuration.assign(duration=x)
+# From movieDuration, using assign
+# replace the duration column with x
+
+sortedByDuration = \
+  movieDuration.assign(duration=x)\
+  .sort_values('duration',ascending=False)\
+  .set_index('title').reset_index()
+```
+
+`duration` `dtype` in `movieDuration`:
+
+![movieduration](https://img.enkipro.com/900b83a7e0b0b01183e48390df0cfcf2.png)
+
+`dtype` in `sortedByDuration`:
+
+![sortedby](https://img.enkipro.com/7d620e0831c0cb4a749aa8623bb52f08.png)
+
+Then we sort the values by the new `duration` column in descending order.
+
+Lastly, we set the index to `title` and then reset it to make the `title` a column and get a new logical index starting from 0 to the length.
+
+Before reset:
+![before-reset](https://img.enkipro.com/84fd6109603dce8537fb3b98e69ae1fc.png)
+
+After reset:
+
+![after-reset](https://img.enkipro.com/61cc15f3ea0a10f13ade8ca914c9c381.png)
+
+> Here is all the work we have done so far in a notebook:
+
+[Google Colab Notebook](https://colab.research.google.com/drive/18kAABpWyjlbJGNbJORniu_UxdTYjMB-2?usp=sharing)
+
+
+--- 
+
+## Practice
+
+Fill in the gaps to replace the "gredas" column in the students `DataFrame` with the "grades" series.
+
+```python
+students = \
+  students???(???=???)
+```
+
+- .assign
+- gredas
+- grades
+- .add
+- Grades
+- Greads
+
+---
+
+## Revision
+
+Given the code:
+
+```python
+shows['Length']\
+  .astype(int)
+```
+
+Select the answer that best explains what is going on.
+
+???
+
+- The `shows` df has its `Length` column transformed into a `Series` of integer values.
+- The `shows` df has all of its columns transformed into integer values.
+
+---
+
+## Footnotes
+
+[1: Sorting Strings]
+
+`String`s are not sorted like numbers. They are sorted **lexicographically**.
+
+This means that every digit in a `string` filled with numbers is treated as a letter. 
If you try to sort 1, 2, and 10, they will be treated as "1", "2", and "10". + + +This is the same as comparing "a", "b" and "ab". + +"a" clearly goes before "b" in the lexicographical order. +```plain-text +a +b +``` + +Next we compare "ab" with "b", "a" is clearly before "b", so "ab" goes before "b". +```plain-text +a +ab +b +``` + +The same logic applies to `string`s containing numbers. + +"1" is clearly less than "2" +```plain-text +1 +2 +``` + +In "10", 1 is clearly less than "2" so it goes before. +```plain-text +1 +10 +2 +``` \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-iv/longest-movies.md b/python/data-analysis/pyda-analyzing-iv/longest-movies.md new file mode 100644 index 0000000000..e2502b0a47 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iv/longest-movies.md @@ -0,0 +1,114 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Str.Replace](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.replace.html){documentation} + +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# Longest Running Movies + +--- +## Content + +For longest-running movies, we are going to remove ` min` from every cell in the `duration` column using `str.replace`, and then convert that column to numerical values using `.astype(int)`. + +The syntax for `.str.replace()` is : +```py +Series.str.replace( + pat, + repl, + n= -1, + case=True, + regex=True +) +``` + +`pat` is the sequence of characters or a regular expression. + +`repl` is the replacement string. + +`n` is the number of replacements to make from the start. By default it is set to `-1` which means *replace all*. + +`case` is `bool`. `True` means case sensitive, `False` means not. By default, set to `True`. + +`regex` is also of type `bool`. `True` means treat `pat` as a regular expression while `False` means to treat it as a string literal. By default, it is set to `True`. 
+ +The output is a `Series` where everything matching `pat` was replaced with `repl`. + +Let's use the same `simplifiedData`[1] from the last insight. + +We will extract movies into `movieDuration` and drop the `type` column. +```py +# Split movies +movieDuration = simplifiedData[simplifiedData['type'] == "Movie"] + +# Drop type column +movieDuration = movieDuration.drop(columns=['type']) +``` + +After that we will remove ` min` from the `duration` column: +```py +movieDuration['duration']= movieDuration['duration']\ +.str.replace(" min", "") +``` + +Now the column looks like this: + +![duration-no-minutes](https://img.enkipro.com/20a9bbe37844becebfd5ce77b42bada2.png) + +> We will continue in the next insight. + +--- + +## Revision + +What does the `.str.replace()` method do in this code? + +```python +df['duration'] = \ + df['duration']\ + .str.replace(" min", "") +``` + +Replaces every string in the ??? column of the ??? `DataFrame` that has a ??? substring in it with ???. + +- `duration` +- `df` +- `" min"` +- `nothing` +- `"min "` +- `DF` +- `replace` +- `a space` + +--- +## Footnotes +[1: Previous Data] + +```py +# Remove columns we wont be using in this analysis +simplifiedData = importedRawData.drop( + columns=[ + "show_id", + "description", + "country", + "date_added", + "director", + "cast", + "release_year", + "rating", + "listed_in" + ] +) +``` diff --git a/python/data-analysis/pyda-analyzing-iv/longest-running-shows.md b/python/data-analysis/pyda-analyzing-iv/longest-running-shows.md new file mode 100644 index 0000000000..d70ca81b80 --- /dev/null +++ b/python/data-analysis/pyda-analyzing-iv/longest-running-shows.md @@ -0,0 +1,109 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + + +--- + +# Longest Running Shows + +--- +## Content + +In the first Analyzing workout, we set up a couple of questions that we will try to answer regarding the `netflix_titles` dataset[2]. 
+ +In this workout, we will answer **which shows and movies have the longest duration**. + +Let's start with the shows first. + +To determine which show has the most seasons we need to use the `duration` column. Along with that, we also need the `title` column. + +It will help us in determining the titles of the longest and shortest running shows. + +We already know that there are no missing values in these two columns[1]. + +Since they are the only columns we need, let's remove the information we don't need and extract rows where `type` equals `TV Show`. +```py +# Remove columns we wont be using in this analysis +simplifiedData = importedRawData.drop( + columns=[ + "show_id", + "description", + "country", + "date_added", + "director", + "cast", + "release_year", + "rating", + "listed_in" + ] +) +``` + +Extract TV shows: +```py +showDuration = simplifiedData[simplifiedData['type'] == "TV Show"] +``` + +After extracting tv shows from the `type` column, let's drop it. +```py +showDuration = showDuration.drop(columns=['type']) +``` + +What we are left with is: + +![showduration](https://img.enkipro.com/b2b53d42b2c50f241fafaf129e279734.png) + +So, what can we do with this? + +For instance we can determine how many shows have how many seasons: +```py +showDuration\ + .groupby('duration')\ + .size()\ + .sort_values(ascending=False)\ + .reset_index(name="Num Of Shows") +``` + +This gives us this output: + +![shows-per-seasons](https://img.enkipro.com/30b18b757f39d3f7ec6aec384c9b9476.png) + +From this, we get that there are only 13 shows with 10 or more seasons. While over 1000 shows have at least 1 season. + +Knowing this, we can use the `.loc` function to locate the `14 Seasons` row to find its title. 
+ +```py +showDuration.loc[showDuration['duration'] == "14 Seasons"] +``` + +The output looks like this: +![14seasons](https://img.enkipro.com/cde1eafbe5e64448ef6b47cfefea1c15.png) + +--- +## Footnotes +[1: No missing values] + +![nomissingvalues](https://img.enkipro.com/629e4a33e5c98d860011a442d3bba282.png) + +[2: End goal questions] + +End Goal Questions: + +- How many movies vs TV shows are on this dataset +- How many ratings exist in the dataset +- How many movies of specific ratings are there +- Count of movies for each rating +- How many TV shows of specific ratings are there +- Count of TV shows for each rating +- In how many movies does a specific actor star in / movies per actor +- How many movies/TV shows were released every year +- Total number of actors in movies and TV Shows +- Total number of actors that have X amount of movies/TV shows +- Which shows and movies have the longest duration +- How many genres are there +- How many movies of a specific genre are there +- Number of movies for all genres diff --git a/python/data-analysis/pyda-analyzing/README.md b/python/data-analysis/pyda-analyzing/README.md new file mode 100644 index 0000000000..85f6085703 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/README.md @@ -0,0 +1,17 @@ +name: Analysis I + +description: Learn different methods for manipulating a dataset and visualizing outputs. 
+ +insights: + - end-goals + - movie-tvshow-count + - visualize-movie-show-count + - modifying-pie-chart + - modifying-pie-chart-ii + - movies-per-rating + - dataframe-loc + - visualize-movie-ratings + +aspects: + - introduction + - workout diff --git a/python/data-analysis/pyda-analyzing/dataframe-loc.md b/python/data-analysis/pyda-analyzing/dataframe-loc.md new file mode 100644 index 0000000000..5f31784556 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/dataframe-loc.md @@ -0,0 +1,78 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# DataFrame.loc + +--- +## Content + +Previously, we have found how many movies of each rating there are on this dataset[1]. + +However, what if we wanted to find a single rating and save it in a new variable? + +To search for only a single rating, we would use the `DataFrame.loc` function. + +Let's find the number of movies that have the `TV-Y` rating: + +```python +ratingTVY = netflix_movies.loc[netflix_movies['rating'] == 'TV-Y'].count() +``` + +Calling `ratingTVY` we get this output: + +![rating-tvy-output](https://img.enkipro.com/f7b7f094361df6f461a20f0c128ce806.png) + + +Using this function we could manually find all movies of every rating as we did in the previous workout using `value_counts()`. However, it would take more time. + +--- +## Practice + +What will the `.count()` method do in the following code? +```py +df.loc[df['Type'] == 'Film'].count() +``` + +Count the number of rows that have ??? + +- the `Film` value in the `Type` column. +- the `Type` value in the `Film` column. + +--- +## Revision + +Fill in the gaps to find all rows that contain the value `Mike` in the `Name` column of the given `DataFrame` and count them: + +```python +df.loc[df['???'] == '???'].??? 
+``` + +- Name +- Mike +- count() +- name +- mike +- Count() + +--- +## Footnotes + +[1: Movies Per Rating] + +![rating-value-counts](https://img.enkipro.com/0cc70287b1a54ac2352a870d35c659a4.png) \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing/end-goals.md b/python/data-analysis/pyda-analyzing/end-goals.md new file mode 100644 index 0000000000..4a4c22a578 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/end-goals.md @@ -0,0 +1,111 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +--- + +# End Goals? + +--- +## Content + +First off, here's what we did in the previous workout[1]. + +To sum up, we determined which dataset we will use, which columns we will drop and we cleaned it up a bit. + +This process is called **data wrangling[2]**. + +The next step is to decide what we want to do with the information. + +So, let's think of a few questions we would like to answer after analyzing this dataset. + +What do we know? + +We know that we have ratings, cast members, genre (`listed_in`), year of release, type of content, and the duration of it. + +Here are some of the questions we can answer about this dataset: +- How many movies vs TV shows are there? +- How many ratings exist? +- How many movies vs TV shows of a specific ratings are there? +- and more[3]. + +--- +## Footnotes + +[1:Previous workout summary] +Here is all the code we have used in the previous workout: +```py +# Import pandas with the alias "pd" +import pandas as pd + +# Create a DataFrame called "importedRawData". +# Read an external CSV file called "netflix_title.csv" and save it into the DataFrame. +importedRawData = pd.read_csv('netflix_titles.csv') + +# Get the Column & Row count +importedRawData.shape +# (6234, 12) + +# We ran .head() and .tail() to see if everything loaded +importedRawData.head() +importedRawData.tail() +# it did + +# Check which cells are empty +# True means empty while False means populated. 
+importedRawData.isnull() + +# To not waste time counting all the missing fields, we attached a .sum() function to give us the sum of missing values +importedRawData.isnull().sum() +``` +Which gave us this output: + +| Column | Missing Information | +|--------------|---------------------| +| show_id | 0 | +| type | 0 | +| title | 0 | +| director | 1969 | +| cast | 570 | +| country | 476 | +| date_added | 11 | +| release_year | 0 | +| rating | 10 | +| duration | 0 | +| listed_in | 0 | +| description | 0 | + +Next, we removed the columns we wont use in the analysis: +```py +# Remove columns we wont be using in this analysis +importedData = importedRawData.drop( + columns=["show_id", + "description", + "country", + "date_added", + "director" + ] +) +``` + +[2: Wrangling] +Data Wrangling is the process of collecting, selecting, and transforming data to answer an analytical question. + +This process is very important and is present in every analysis. + +Before you can analyze any data, you will have to gather it, select the portions you need, remove the ones you don't, and transform others to your needs. 
+ +[3: Questions] +- Count of movies for each rating +- Count of TV shows for each rating +- In how many movies does a specific actor star in / movies per actor +- How many movies/TV shows were released every year +- Total number of actors in movies and TV Shows +- Total number of actors that have X amount of movies/TV shows +- Which shows and movies have the longest duration +- How many genres are there +- How many movies of a specific genre are there +- Number of movies for all genres diff --git a/python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md b/python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md new file mode 100644 index 0000000000..20e42beef2 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md @@ -0,0 +1,137 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# Visualize Output III + +--- +## Content + +So far, we modified our pie chart using this code[1]. The output looks like this: + +![with-autopct](https://img.enkipro.com/8070c3d3f4bbb725ef7b528ab7487cfa.png) + +Another step we could do to even better visualize each portion of the chart is to separate them. This is done through the `explode` parameter. + +```py +plt.title( + 'Movies VS TV in %', + color="White" +) + +plt.pie( + [4265,1969], + labels=['Movies','TV Shows'], + textprops={"color": "white"}, + autopct="%.2f", + explode=(0, 0.1) +) + +plt.show() +``` + +The output looks like this: + +![pie-chart-percentage](https://img.enkipro.com/d39c93873479e577b35fd11e786c38b1.png) + +`explode` has the same number of parameters as there are slices of the chart. + +If our chart had 4 slices, and we wanted to only separate the 3rd one, we would write: +```python +explode =(0,0,0.1,0) +``` + +The number `0` represents no change. 
Positive values move portions of the chart away from the center. Whereas negative values move them towards the center. + +If we used a larger value like `1`, the chart would look like this: + +![explode-1](https://img.enkipro.com/c14ddb84d4b30b98b3103852cb8474c2.png) + +Whereas if we used a negative value like `-1` it would look like this: + +![explode-negative-1](https://img.enkipro.com/8b1661eeaee31d4493725f6472e49eb8.png) + + +--- + +## Practice + +Fill in the gaps to create a pie chart that has its third slice slightly separated from the chart. Don't forget to show the plot afterward. + +```python +data = [100, 150, 200] +labels = 'Milk','Coffee','Tea' + +plt.pie( + ???, + ???=labels, + ???=???, + autopct="%.f" +) + +plt.title('Drinks left') +??? +``` + + +- data +- labels +- explode +- `(0, 0, 0.1)` +- plt.show() +- plt.display() +- `(0.1, 0.1, 0)` +- separate + +--- + +## Revision + +Add the pie chart parameter name to its definition. + +```python +??? - Offset a portion of the chart. +??? - Mark the labels of each portion. +??? - Add numerical representation in percentage. 
+``` + +- explode +- labels +- autopct +- mark +- wedge + + +--- +## Footnotes +[1: Previous Pie Chart] + +```python +plt.pie( + [4265,1969], + labels=['Movies','TV Shows'], + textprops={'color':"white"}, + autopct='%.2f' +) + +plt.title( + 'Movies VS TV in %', + color="White" +) + +plt.show() +``` diff --git a/python/data-analysis/pyda-analyzing/modifying-pie-chart.md b/python/data-analysis/pyda-analyzing/modifying-pie-chart.md new file mode 100644 index 0000000000..59034b79c9 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/modifying-pie-chart.md @@ -0,0 +1,86 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +links: + - >- + [Matplotlib Modules](https://matplotlib.org/stable/py-modindex.html){documentation} + - >- + [Pyplot Module](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html){documentation} + - >- + [String Formatting](https://thepythonguru.com/python-string-formatting/){documentation} + +--- + +# Visualize Output II + +--- +## Content + +Previously, we have created this basic pie chart: + +![plt-pie-output](https://img.enkipro.com/91fd23fddc318cc690ae4cba72b1c809.png) + +Let's modify it. + +First off, since we are using a dark theme, let's change the text color to white and add some custom labels: +```python +plt.pie( + [4265,1969], + labels=['Movies','TV Shows'], + textprops={'color':"white"} +) + +plt.show() +``` + +Labels are added through the `labels` parameter while the text color is changed through `textprops`. + +Currently, the chart looks like this: + +![labels-and-textprop](https://img.enkipro.com/c8c7197baf90ab37ae3725f318716ace.png) + +Next, let's add a title and change its color to white: +```python +plt.title( + 'Movies VS TV in %', + color="White" +) +``` + +![labels-and-textprop](https://img.enkipro.com/86633a292c711657c17743768794c63d.png) + +So far, the chart has a label, a title, and all of the text set to white. 
+ +Next, we can display what percentage of a whole is each portion of the chart. This is done through the `autopct` parameter. + +```python +plt.pie( + [4265,1969], + labels=['Movies','TV Shows'], + textprops={'color':"white"}, + autopct='%.2f' +) + +plt.title( + 'Movies VS TV in %', + color="White" +) + +plt.show() +``` + +![with-autopct](https://img.enkipro.com/8070c3d3f4bbb725ef7b528ab7487cfa.png) + + +`autopct` uses string formatting. The value we passed to it is `%.2f`. + +Since we are representing a percentage, we need to use floating-point numbers. + +(`%`) is a special character that tells `autopct` to display the value as a percentage up to two decimal points (`.2`) of a floating-point number (`f`). + + +> We will continue in the next insight. \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing/movie-tvshow-count.md b/python/data-analysis/pyda-analyzing/movie-tvshow-count.md new file mode 100644 index 0000000000..f780c20f77 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/movie-tvshow-count.md @@ -0,0 +1,112 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Movie & TV Show Count + +--- +## Content + +To determine how many movies and/or tv shows this dataset has, we will use two functions: `DataFrame.loc` and `.count()` + +`DataFrame.loc` is used to locate a group of rows and columns based on the label. + +`.count()` is used to count the total number of rows without `NaN` values across all columns. + +Here's how we can use them on our `importedRawData` dataset: +```py +# Locate rows where the column labeled "type" has "Movie" in it. +netflix_movies = importedData.loc[importedData['type'] == 'Movie'] + +# Locate rows where the column labeled "type" has "TV Show" in it. 
+netflix_series = importedData.loc[importedData['type'] == 'TV Show'] +``` + +Using `.loc`, we have extracted movies and TV shows into separate variables to be able to easily get the total count of rows for movies and shows separately. + +Next, we applied the `.count()` function and printed the result. + +```python +# Count and print the number of values for netflix_movies and netflix_series +print(netflix_movies.count()) + +print(netflix_series.count()) +``` + +> 💡 We could also attach `.count()` at the end of each `.loc` statement. + +The output of this print statement looks like this: + +![movie-tvshow-count](https://img.enkipro.com/2cb189e0e89c3245a37bb333d8d4eba1.png) + +The left side is the name of the column and the right side is how many rows of data exist in that column. + +Since we know from our previous function[1] that the `type` column has no missing data. We can determine that this number is the total number of movies and/or tv shows. + +As we have first run the `netflix_movies.count()` the upper numbers are for movies. + +That means there are a total of 4265 movies and 1969 tv shows on this dataset. + +--- + +## Practice + +Fill in the gaps to display every person in the `people` dataset whose `age` is 27 or older. + +```python +people.loc[ + ???['???'] ??? '27' +] +``` + +- `people` +- `age` +- `>=` +- `Age` +- `>` + + +--- + +## Revision + +The function that is used to locate a group of rows and columns based on the label/s is: + +```python +DataFrame.??? 
+``` + +- loc +- find +- search +- discover + +--- + +## Footnotes + +[1:Sum Of Empty Cells] +We determined which columns do and don't have empty cells with this command: + +```python +importedRawData.isnull().sum() +``` + +Which gave us this output: +![sum-empty-cells](https://img.enkipro.com/629e4a33e5c98d860011a442d3bba282.png) diff --git a/python/data-analysis/pyda-analyzing/movies-per-rating.md b/python/data-analysis/pyda-analyzing/movies-per-rating.md new file mode 100644 index 0000000000..b20649df5d --- /dev/null +++ b/python/data-analysis/pyda-analyzing/movies-per-rating.md @@ -0,0 +1,99 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +links: + - >- + [TV Parental Guidelines](https://en.wikipedia.org/wiki/TV_Parental_Guidelines){documentation} + +--- + +# Movie Per Rating + +--- +## Content + +Currently, we know we have exactly 4265 movies in this dataset and an X amount of ratings. + +Let's figure out which groups of ratings there are and how many movies belong to each group. + +> Using this information we will be able to determine which movie we would like to watch based on a rating. + +We already found and saved into a new variable the number of movies and tv shows[1]. + +To find how many movies belong to each rating we will use `.value_counts()`. + +`value_counts()` returns a `Series` containing the count of only unique values. All non-unique and `NaN` values are ignored. + +There are two ways we can apply this to our dataset. + +First option: +```python +# 1 +netflix_movies.value_counts('rating') +``` + +This translates to: + +On `netflix_movies` count all the values for the `rating` column. 
+ +The output looks like this: + +![value-counts-rating](https://img.enkipro.com/b25578827a1f43c6de372b4fb786cae8.png) + +Second option: + +```python +# 2 +netflix_movies.rating.value_counts() +``` + +On `netflix_movies` `rating` column, count all the values. + +Now it looks like this: + +![rating-value-counts](https://img.enkipro.com/0cc70287b1a54ac2352a870d35c659a4.png) + +Both options give us the total number of movies per rating sorted in descending order. + +Whichever option you chose, save it in a new variable, for instance, `movie_ratings`. We will use this later on. + +According to these results, these are all the ratings we have: `TV-Y`, `TV-Y7`, `G`, `TV-G`, `PG`, `TV-PG`, `PG-13`, `TV-14`, `R`, `TV-MA`, `NC-17`, `TV-Y7-FV`, `NR`, and `UR`. + +> To learn more about what each rating means, check out the **Learn More** section. + +--- +## Revision + +Fill in the gaps to count the total number of values for the books column: + +```python +df.???.??? +``` + +- books +- value_counts() +- count_value() + +--- +## Footnotes + +[1: Movie/Tv Show Count] + +```py +# Locate rows where the column called "type" has "Movies" in it. +netflix_movies = importedData.loc[importedData['type'] == 'Movie'] + +# Locate rows where the column called "type" has "TV Show" in it. 
+netflix_series = importedData.loc[importedData['type'] == 'TV Show'] + +``` diff --git a/python/data-analysis/pyda-analyzing/visualize-movie-ratings.md b/python/data-analysis/pyda-analyzing/visualize-movie-ratings.md new file mode 100644 index 0000000000..d2a51c7af5 --- /dev/null +++ b/python/data-analysis/pyda-analyzing/visualize-movie-ratings.md @@ -0,0 +1,131 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [sort_values](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html){documentation} + +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Visualize Movie Ratings + +--- + +## Content + +Just like before, to better represent what we found, let's visualize it. + +For this visual representation, we will pick the `bar` chart. + +Bar charts are best used to provide a visual presentation of ordinal data. + +If our data had 6 or fewer categories, it would be best to use a pie chart. However, with these 14 categories, a pie chart would be too cluttered. + +To create this chart, we will use the data from the previously created `movie_ratings`[1] variable. + +Unlike plotting the `pie` chart, this time we will use the `.plot` function. + +This is a universal plotting function that can be applied to a `Series` or a `DataFrame` to plot any chart type. Some of these are, `line`, `bar`, `barh`, `pie`, `box` and so on. The default chart type is `line`. + +The `.plot` function has over 30 arguments you can use, but only one is mandatory (the data). + +Let's see how we can apply this function to our `movie_ratings` variable: + +```py +import matplotlib.pyplot as plt + +# Plot a `kind=bar` chart from `movie_ratings` +movie_ratings.plot(kind='bar') +plt.show() +``` + +In our example, the data is applied by attaching the `plot` function to the `Series`. 
If it were a `DataFrame`, we would also have to specify the `x` and `y` values based on the columns we want to use. + +The visualized output looks like this: + +![visualized-moviecount](https://img.enkipro.com/7cb7123b75813d6b9f8f4a23b9d3eb79.png) + +As the x axis labels are hard to read in this image, we can also slant them to a desired degree. + +This is done through the `.xticks(rotation=degrees)` method. + +For instance: +```python +import matplotlib.pyplot as plt + +movie_ratings.plot(kind='bar') +plt.xticks(rotation = 45) + +plt.show() +``` + +After adding the 45 degree rotation, the visualization looks like this: + +![visualized-slanted-x-axis-movie-count](https://img.enkipro.com/dff8a6358155ebb5b50737620cf2a86b.png) + +> Here is all the work we have done so far in a notebook: + +[Google Collab Notebook](https://colab.research.google.com/drive/1vn3b-SaZX4Jky7vEduRYUpd5Wvx7YbDQ?authuser=1) + +--- + +## Revision + +Given the `DataFrame`: + +```python +data = { + "Name": [ + "Stefan", + "Marcus", + "Danielle", + "Layla" + ], + "Book_Count": [36, 33, 71, 132] +} + +bookCountPerPerson = pd.DataFrame(data, columns=["Name", "Book_Count"]) +``` + + +Fill in the gaps to create a simple bar chart from `bookCountPerPerson`. Don't forget to show the plot afterward. + +```python +import matplotlib.pyplot as plt + +bookCountPerPerson.???( + ??? = "Name", + ??? = "Book_Count", + ??? = "bar" +) + +??? 
+``` + +- plot +- x +- y +- kind +- plt.show() +- type +- style + +--- + +## Footnotes + +[1: Previous Code] + +```python +movie_ratings = netflix_movies.rating.value_counts() +``` \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing/visualize-movie-show-count.md b/python/data-analysis/pyda-analyzing/visualize-movie-show-count.md new file mode 100644 index 0000000000..2ba22af28c --- /dev/null +++ b/python/data-analysis/pyda-analyzing/visualize-movie-show-count.md @@ -0,0 +1,156 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +links: + - >- + [Matplotlib Modules](https://matplotlib.org/stable/py-modindex.html){documentation} + - >- + [Pyplot Module](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html){documentation} + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# Visualize Output I + +--- +## Content + +In the previous insight[1] we figured out how many movies and shows are there in the database. However, to fully grasp or communicate the scale of the difference, we could also visualize these stats. + +To do this we will use `matplotlib`. You can see how to install it here[2]. + +`matplotlib` is an open-source library that contains an extensive collection of modules for visualization. + +Before we decide which module we will use, we need to decide which type of chart we want to plot. + +Pie charts are best used when you want to compare parts of a whole. Since we are only comparing two parts of a whole, we will use this chart. + + +The `matplotlib` library provides a `.pie` function in the `.pyplot` module. + +To use it, we need to import it. 
+ +```py +import matplotlib.pyplot as plt +``` + +To create the most basic pie chart, all we have to do is add the data we gathered in the last insight to the `.pie` function and show the plot: + +```python +# netflix_movies.count() +# 4265 + +# netflix_series.count() +# 1969 + +# Plot Pie chart +plt.pie([4265,1969]) + +# Show the plot +plt.show() +``` + +The output of this code looks like this: + +![plt-pie-output](https://img.enkipro.com/91fd23fddc318cc690ae4cba72b1c809.png) + +In the example, we added the output values by rewriting them. This is not the best practice for larger data. We can directly plug in the outputs of our previous functions by modifying them a bit. + +Instead of `netflix_movies.count()` which gives the count of all rows across all columns, we can count only one column's values and plug that in. + +We will pick any column with zero `NaN` values[3]. For instance, the `type` column. + +```python +plt.pie([ + netflix_movies.type.count(), + netflix_series.type.count() +]) + +plt.show() +``` + +`plt.show()` looks for all currently active figure objects, and opens interactive windows that display your figures. This is required to properly display the chart. + +> Let's modify the appearance of the chart in the next insight. + +--- + +## Practice + +Create a pie chart by filling in the blanks. Don't forget to show the plot. + +```python +import matplotlib.pyplot as plt + +plt???(???) + +plt??? +``` + +- `.pie` +- `[1200, 900]` +- `.show()` + +--- + +## Revision + +Pie charts are best used when you want to represent ??? + +- parts of a whole +- a change over time + +--- + +## Footnotes + +[1: Movie/TV Show Count] + +```py +# Locate rows where the column called "type" has "Movie" in it. +netflix_movies = importedData.loc[importedData['type'] == 'Movie'] + +# Locate rows where the column called "type" has "TV Show" in it. 
+netflix_series = importedData.loc[importedData['type'] == 'TV Show'] + +# Count and print the number of values for netflix_movies and netflix_series +print(netflix_movies.count()) + +print(netflix_series.count()) + +# netflix_movies.count() = 4265 +# netflix_series.count() = 1969 +``` + +[2: Installing matplotlib] + +There are two ways to install `matplotlib`. + +Installing through `pip`: +```python +python -m pip install -U matplotlib +``` + +> If you already have `matplotlib` installed, the above code will uninstall an older version and install the newest stable release. + +Installing through `conda`: + +```python +conda install -c conda-forge matplotlib +``` + +[3: No Nan Values] + +![raw-isnull-sum](https://img.enkipro.com/629e4a33e5c98d860011a442d3bba282.png) diff --git a/python/data-analysis/pyda-da-analysis-environments/README.md b/python/data-analysis/pyda-da-analysis-environments/README.md new file mode 100644 index 0000000000..d4ab038d4c --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/README.md @@ -0,0 +1,13 @@ +name: Analysis Environments + +description: Get familiar with different analysis environments. 
+ +insights: + - what-are-analysis-environments + - ipython-vs-shell-vs-scripts + - different-tools-to-use + - notebooks + - creating-your-first-notebook + +aspects: + - introduction \ No newline at end of file diff --git a/python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md b/python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md new file mode 100644 index 0000000000..6692595f62 --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md @@ -0,0 +1,49 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Markdown Guide](https://www.markdownguide.org/basic-syntax/){website} + +--- + +# Creating Your First Notebook + +--- +## Content + +Open your terminal and type `jupyter-lab` if you installed the application locally or go to [their website](https://jupyter.org/try) to open the online version. + +Next, to create a new notebook go to `File -> New -> Notebook`. + +This will give you an empty notebook with a single empty cell. + +At the top, you can choose what kind of cell this will be. There are two types: `Code` and `Text/Markdown` cells. + +`Code` is used for any code you want to write and run. + +`Text/Markdown` is used for adding text as well as elements like headers, paragraphs, and others. + +![preview](https://img.enkipro.com/2b3ab5584c545906ee8ccbf7119ea3e9.png) + +The first two cells are `Text/Markdown` cells. The first one contains markdown and the second one contains that same markdown but *executed* to show the resulting text. + +> 💡 To learn more about markdown, check the *Learn More* section. + +The third cell is a `Code` cell. The `[1]:` to the left of it means that it was the first `Code` cell that was executed. This cell prints `Hello World` to the console, which is shown underneath it. + +The final two cells both have a `[2]:` before them.
+ +The first `[2]:` is shown because that was the second `Code` line that was executed. The next `[2]:` is shown because it matches the output of the previous cell. We returned the variable `x` and its resulting output is `"I am a String"`. + +Any `Code` cell that returns something will have a matching cell after it with the same number. + +> 💡 The cell numbers don't have to be in order! If you run the same cell again or run a different cell, the number will increase. + +Once you're done playing with the example, save the notebook. + +> 💡 We will be using the same notebook in the next few workouts to import and analyze a dataset. diff --git a/python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md b/python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md new file mode 100644 index 0000000000..cbd32e35b6 --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md @@ -0,0 +1,103 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +links: + - '[Google colab](https://colab.research.google.com/notebooks/basic_features_overview.ipynb#scrollTo=JyG45Qk3qQLS){documentation}' + - '[VScode Jupyter Notebooks](https://code.visualstudio.com/docs/python/jupyter-support){website}' + - '[Spyder Notebook](https://github.com/spyder-ide/spyder-notebook){documentation}' + - '[Binder](https://mybinder.org/){website}' + - '[Jupyter.org](https://jupyter.org/){website}' + - '[Cocalc](https://cocalc.com/){website}' + + +--- + +# Python Coding Tools + +--- +## Content + +Before we start coding, we have to decide which tool we're going to use. + +One important category split between most tools is whether they're accessible offline or online. + +The main difference is that online tools are immediately available (assuming you have an internet connection) and usually come with all the necessary libraries[1] included (no need to install anything). 
+ +Offline tools need to be installed first, including any library you might need, but can afterwards be used without internet. + +> 💡 A popular kind of Python coding tool (available both offline and online) is called a *notebook*. + +Notebooks offer interactive Python environments that can combine code with other visual elements such as text, charts, and images. + +One offline tool that lets you use notebooks is VScode[2] + +As for online tools, here're two: + +| Name | Unique Feature | +|--------------|-------------------------------------| +| Google Colab | Real-time collaboration | +| Cocalc | Collaboration based on edit history | + +For some in-depth information on these tools and more, check out the **Learn More** section. + +> 💡 We mentioned in the first insight that we'll be using Jupyter Lab in this course. All of our examples will be available online. If you prefer to run Jupyter Lab offline, here's how you can install it using `pip`[3] or **anaconda**[4]. + +> 💡 All of the tools mentioned above have support for Jupyter notebooks. It doesn't matter which tool you choose to work on, you can still follow along. + +--- + +## Footnotes +[1: Libraries] +Think of libraries as external programs you can use to quickly get some functionality you don't want to write by yourself. + +For example, if you wanted to draw plots and charts, you'd typically use an existing charting library instead of writing all the code for graphical and spatial calculations yourself. + +Most online Python environments come with popular libraries pre-installed. + +[2: VSCode] +VScode is an advanced text editor with extensions that let you edit, modify, delete, create and run notebooks. + +[3:pip] +pip is a package manager for Python. + +> 💡 When installing through `pip` you first have to make sure `pip` is upgraded to the latest version. 
+ +To upgrade: +```python +# use pip to upgrade itself :) +pip install --upgrade pip +``` + +If you don't have pip, download the latest python installer from the [official website](https://www.python.org/downloads/) and make sure the checkbox for `pip` is ticked on. + +> 💡 Along with Jupyter we'll also install iPython. + +```python +pip install --upgrade jupyter ipython +``` + +To run Jupyter, type this in your terminal: +```sh +jupyter-lab +``` + +[4:Anaconda] +Anaconda is a package and environment manager for Python and R. It provides a graphical user interface and a terminal. + +To download Anaconda visit [their official website](https://www.anaconda.com/products/individual) link. + +> 💡 Along with Jupyter we'll also install iPython. + +Installing through Anaconda: +```sh +conda install jupyter ipython +``` + +To run Jupyter, type this in your terminal: +```sh +jupyter-lab +``` diff --git a/python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md b/python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md new file mode 100644 index 0000000000..2aec075d79 --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md @@ -0,0 +1,106 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# IPython (and Scripts) + +--- +## Content + +### IPython + +The Python language comes with a basic interpreter and a [REPL](https://www.enki.com/glossary/general/repl) that lets us write and run Python programs. 
+ +iPython is an enhanced version of that with user-friendly features such as: +- Auto-completion +- Support for Data Visualization +- Multi-line editing +- Syntax highlighting +- and more + +The IPython shell is usually the recommended shell as it runs your Python code just like the normal Python shell does while also providing a richer set of features on top[1]. + +The IPython interpreter, as well as the basic Python interpreter, are both interactive shells that are accessed through the terminal via the `python`/`ipython` commands. + +If we save code into a file, we call that file a *script*. + +If we give the name of our file to the Python shell, we can have it execute the code for us. + +```sh +# scripts with python code +# are usually saved with +# the .py extension +python my_script.py +``` + +> 💡 Scripts are executed in the same way as regular, command-line code. + + +--- +## Practice + +??? are command-line tools used to execute code. + +??? are pieces of code saved into files. + +??? are sessions on the computer used to communicate code to shells. + +- Shells +- Scripts +- Terminals + +--- +## Revision + +When you save code in a file, what do you typically call that file? + +???. + +- a script +- a terminal +- text +- an interpreter + +--- +## Footnotes +[1: Multi Line Execution] + +Here is the same code run on the basic interpreter vs IPython: + +Basic interpreter: + +![windows-10-example](https://img.enkipro.com/cb342ec6c5fb4860fee889d907ee176b.png) + +IPython: + +![ipython-example](https://img.enkipro.com/02420b736677cad5a5d5d8bcaac54bf4.png) + +> 💡 In iPython, you can re-run any part of code you've already run with or without modification to the code. + +
+ +On the other hand, in the regular interpreter, we had to write all lines of code one by one. + +In any terminal, pressing the ⬆️ key would give us our last executed line. + +If we press ⬆️ in iPython, it would give us last executed `Line`. + +> ⚠️ The multi-line editing feature is not available within the iPython terminal and only available in notebooks (more on this to come later) + +You can think of notebooks as interactive Python environments that can combine code execution, rich text, charts, and rich media. + \ No newline at end of file diff --git a/python/data-analysis/pyda-da-analysis-environments/notebooks.md b/python/data-analysis/pyda-da-analysis-environments/notebooks.md new file mode 100644 index 0000000000..4ced24c30f --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/notebooks.md @@ -0,0 +1,112 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# What Are Notebooks? + +--- +## Content + +Notebooks are interactive documents[1] that let you combine code with other visual elements such as text, charts, and images. + +As long as you have a link to a notebook you can share it with your team or modify it from any computer. + +You and anyone you've shared your notebook with can collaborate on the notebook by writing parts of it or leave discussion comments. + +Notebooks let you mix code, text and images in any order you like, and each piece can be tested independently. + +> ⚠️ Running code out-of-order may have unintended effects[2] as notebooks run as whole, top to bottom. + +We'll show you how to create a notebook on Jupyter Lab in the next insight. + +--- +## Practice + +Notebooks are documents that support ??? + +- various elements such as code, text and images, and more +- only code and images + + +--- +## Revision + +Notebooks support interactive images. 
+ +??? + +Notebooks support only markdown cells. No programming code. + +??? + +Notebooks are like scripts. You can execute the whole notebook using a single button. + +??? + +If you have a sharable link to an online notebook you can modify it from any computer. + +??? + +- True +- False +- True +- True +- False +- False + +--- +## Footnotes + +[1: Notebook example] +Here's a notebook containing a line of code, some Markdown text, and an image. + +![notebook-writing](https://img.enkipro.com/b0cf77ab69e42faf8e771314ea5c4a46.png) + +And here's how the notebook looks when we run it: + +![notebook-running](https://img.enkipro.com/bf0e7814a79f2764f337137772d133ae.png) + +[2:Unintended Consequences] + +Imagine you have this code: +```python +# Line 1 +x = 1 + +# Line 2 +x = x + 1 + +# ... +# Something in between + +# Line 57 +x = "something else" +``` + +If you run the lines in order up to line 57, everything will work as intended. However, if after line 57 you run line 2 without running line 1 first, you will get an error. + +This is because now `x = "something else"` and the line 2 code would be: +```python +x = "something else" + 1 +``` + +Which will create an error: + +`TypeError: can only concatenate str (not "int") to str` + +So be careful when re-running pieces of code. + \ No newline at end of file diff --git a/python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md b/python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md new file mode 100644 index 0000000000..2cc0b25b4a --- /dev/null +++ b/python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md @@ -0,0 +1,55 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +--- + +# What Are Analysis Environments? + +--- +## Content + +When it comes to coding in Python, there are several environments you can choose from. 
+ +These environments are shells[1], [IDE's](https://www.enki.com/glossary/general/ide), and other programs that let you write, test, and run Python code. + +The most basic environment you can write Python in is your OS's terminal[2]. + +To try it out, open your terminal, type in the word `python` and press Enter. + +A python environment that understands Python code is usually called *the [interpreter](https://www.enki.com/glossary/general/interpreter)*. + +> 💡 Different versions of Python will have different keywords to launch the interpreter. The most common ones are `py` and `python`. + +Even though most operating systems come with a pre-installed version of the Python interpreter it is not the recommended tool for coding as it lacks many user-friendly features. + +There are superior programs with better interpreters, such as: +- Jupyter Lab +- IDLE +- nteract +- PyScripter +- Google colab +- VScode +- and many more + +> 💡 Some of the awesome features these provide are type suggestions, syntax highlighting, in-line graphics, and so on. + +In this course, we will be using the Jupyter Lab environment. + +--- +## Footnotes + +[1: Shell] + +Shells are command-line tools used to execute code. + +[2: Terminal Gif] + +Terminals are programs on the computer that let us communicate with shells. We send input (commands) into and receive output (results) out of the shell via a terminal. + +Here's how we can run the `python` shell: + +![terminal-gif](https://img.enkipro.com/0bf78b2222dca114a879cc242715adc3.gif) diff --git a/python/data-analysis/pyda-da-tips/README.md b/python/data-analysis/pyda-da-tips/README.md new file mode 100644 index 0000000000..e796913511 --- /dev/null +++ b/python/data-analysis/pyda-da-tips/README.md @@ -0,0 +1,10 @@ +name: Tips + +description: Useful tips for analyzing data with Python. 
+ +insights: + - str-contains + - pandas-profiling + +aspects: + - introduction \ No newline at end of file diff --git a/python/data-analysis/pyda-da-tips/pandas-profiling.md b/python/data-analysis/pyda-da-tips/pandas-profiling.md new file mode 100644 index 0000000000..fe92899ad2 --- /dev/null +++ b/python/data-analysis/pyda-da-tips/pandas-profiling.md @@ -0,0 +1,61 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: tip + +--- + +# Pandas Profiling + +--- +## Content + +Sometimes you might want to analyze a dataset but you don't have the time for it. + +Or, you might wanna check if some of your analysis results are correct. + +Luckily for us, there is `pandas profiling`. + +`Pandas profiling` is an open-source module that we can use to perform analysis on a `DataFrame` in just a few lines of code. + +To use it you first have to install it. + +Open your terminal and write: +```py +# pip installation version 1 +pip install -U pandas-profiling[notebook] +jupyter nbextension enable --py widgetsnbextension + +# pip installation version 2 +pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip + +# conda installation +conda env create -n pandas-profiling +conda activate pandas-profiling +conda install -c conda-forge pandas-profiling +``` + +After installing it you need to import it and then you can use it: +```py +from pandas_profiling import ProfileReport +``` + +The syntax to use it is: +```py +ProfileReport(df, title="Some Title Here") +``` + +If we used this on the `df` we made in the previous insight it would look like this: + +Screenshot 1: +![overview](https://img.enkipro.com/23f31d4f4665f7c51e6ddb26b984420d.png) + +Screenshot 2: +![variables](https://img.enkipro.com/e50a063c292f8be31adb72cfd803cef3.png) + +Screenshot 3: +![first-last](https://img.enkipro.com/890da86d282cdd7ef61072b14ea0324c.png) + +> If our dataset was larger, it would create numerous charts for us, determine missing values, show duplicate/missing 
rows/cells, it would also generate warnings, show correlations, and more. \ No newline at end of file diff --git a/python/data-analysis/pyda-da-tips/str-contains.md b/python/data-analysis/pyda-da-tips/str-contains.md new file mode 100644 index 0000000000..46885acdc4 --- /dev/null +++ b/python/data-analysis/pyda-da-tips/str-contains.md @@ -0,0 +1,225 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: tip + +practiceQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# str.contains + +--- +## Content + +`str.contains` is used to find a string based on at least one character. + +This method only works on a `Series` or a single column of a `DataFrame`. + +The syntax is: +```py +Series.str.contains +( + pat, + case=True, + na=None, + regex=True +) +``` + +`pat` is the sequence of characters or a regular expression. + +`case` is `bool`. `True` means case sensitive, `False` means not. By default, set to `True`. + +`na` determines if empty values should be filled. By default, it is off. + +`regex` is also of type `bool`. `True` means treat `pat` as a regular expression while `False` means to treat it as a string literal. By default, it is set to `True`. + +As for the output, there are two possibilities. + +Either a `Series` with the values or an index of boolean values. 
+ +Now, let's say we have this `DataFrame`: +```py +import pandas as pd + +df = pd.DataFrame({ + 'name': [ + 'Joe', + 'Ana', + 'Mariana', + 'Ana-Maria', + 'Magdalena', + 'Aleksandra', + 'Aleks', + 'Sandra' + ], +}) +``` +It looks like this: + +![df](https://img.enkipro.com/1fe787c9f641b0c1c3b41e6c30b42ced.png) + + +### Series output + +Here's how we would find all occurances of `ana`: +```py +df[df['name'].str.contains('ana', case=False)] +``` + +![ana-false](https://img.enkipro.com/b47ba7d8a682f3711ef220e2be80995e.png) + +If we didn't include `case=False` the output would look like this: + +![ana-true](https://img.enkipro.com/1da99b77f12d1428f54dbb6015851ed8.png) + +### Bool output + +If we re-wrote this: +```py +df[df['name'].str.contains('ana', case=False)] +``` +as +```py +df['name'].str.contains('ana', case=False) +``` +we would get a boolean output: + +![boolean-output](https://img.enkipro.com/c3a5cc404a2f0fb49c34bd75d7c3cdc6.png) + + +--- + +## Practice + +Give the `DataFrame`: + +```python +import pandas as pd + +df = pd.DataFrame({ + 'name': [ + 'Joe', + 'Ana', + 'Mariana', + 'Ana-Maria', + 'Magdalena', + 'Aleksandra', + 'Aleks', + 'Sandra' + ], +}) +``` + +If we did this: +```py +df[df['name']\ + .str.contains('ana', case=False)] +``` + + +What do you think the output would be? + +```python +# Option A +| Index | Value | +|-------------|-------------| +| 0 | False | +| 1 | False | +| 2 | False | +| 3 | True | +| 4 | False | +| 5 | False | +| 6 | False | +| 7 | False | +| 8 | False | +| Name: name, | dtype: bool | + +# Option B +| | name | +|-------------|-------------| +| 2 | Ana | +| 3 | Mariana | +| 4 | Ana-Maria | + + +# The output would be +# Option ??? 
+``` + +- B +- A + + +--- + +## Revision + +Give the `DataFrame`: + +```python +import pandas as pd + +df = pd.DataFrame({ + 'name': [ + 'Joe', + 'Ana', + 'Mariana', + 'Ana-Maria', + 'Magdalena', + 'Aleksandra', + 'Aleks', + 'Sandra' + ], +}) +``` + +If we did this: +```py +df['name'].str.contains('ana', case=False) +``` + + +What do you think the output would be? + +```python +# Option A +| Index | Value | +|-------------|-------------| +| 0 | False | +| 1 | False | +| 2 | False | +| 3 | True | +| 4 | False | +| 5 | False | +| 6 | False | +| 7 | False | +| 8 | False | +| Name: name, | dtype: bool | + +# Option B +| | name | +|-------------|-------------| +| 2 | Ana | +| 3 | Mariana | +| 4 | Ana-Maria | + + +# The output would be +# Option ??? +``` + +- A +- B diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md new file mode 100644 index 0000000000..6f5162d199 --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md @@ -0,0 +1,14 @@ +name: Preparing a Dataset + +description: Learn what Series and Dataframes are, and how to prepare a dataset for analysis. 
+ +insights: + - what-and-why-pandas + - series-and-dataframes + - importing-data-sets + - cleaning-dataset + - cleaning-dataset-ii + +aspects: + - introduction + - workout diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md new file mode 100644 index 0000000000..5a5ec05b0a --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md @@ -0,0 +1,104 @@ +--- +author: Stefan-Stojanovic + +aspects: + - workout + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Cleaning The Dataset II + +--- +## Content + +Previously we have determined what our dataset is about and how many rows/columns it has. + +We also checked which cells are empty or not using the `isnull()` function[1]. + +However, reading that table cell by cell would take a lot of time. Luckily, adding `.sum()` at the end would count the missing cells for us: + +![raw-isnull-sum](https://img.enkipro.com/629e4a33e5c98d860011a442d3bba282.png) + +Now that we know what is and isn't missing, we can decide what we want to do with the information we have. + +First, let's remove the columns we won't use. + +To remove a column from a dataset we need to use the `.drop(columns = ["column1", "column2", ...])` command. + +Here are the columns we will remove: + +```py +# Remove columns from importedRawData +# save it into importedData +importedData = importedRawData.drop( + columns=["show_id", + "description", + "country", + "date_added", + "director" + ] +) +``` + +If we were to run head or tail again, we would get a cleaner output. + +![cleaner-head](https://img.enkipro.com/f559d042eb418779165dc355236fab44.png) + +The next step is deciding what we want to analyze and start analyzing. 
We will do that in the next workout. + +> Here is all the work we have done in a notebook: + +[Google Collab Notebook](https://colab.research.google.com/drive/1WQoAQhnaI5Eh-gQ_Qg-VA9VPk7aSJvgB?authuser=1) + +--- +## Practice + +The ??? method is used to remove columns from a DataFrame. + +- `.drop()` +- `.remove()` +- `.disable()` +- `.getRidOf()` + +--- +## Revision + +Finish the code to remove the `"name"`,`"surname"`, and `"age"` columns from the given `DataFrame`: + +```python +df???( + ???=["???", + "name", + "age" + ] +) +``` + +- `.drop` +- `columns` +- `surname` +- `age` + +--- + +## Footnotes + +[1:Missing Values] + +Using the `.isnull()` function on a `DataFrame` will give us a table of True/False values. + +![raw-is-null](https://img.enkipro.com/f21cee53d181046c10b517cf0bf4c9a2.png) diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md new file mode 100644 index 0000000000..2fe3f29619 --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md @@ -0,0 +1,99 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Initializing & Cleaning a Dataset + +--- +## Content + +Using the dataset from the previous insight[1], we will show you how to clean it up before we start the analysis. + +First off, when we import a dataset, we can use the `head()` or `tail()` functions to check the top or bottom 5 rows, respectively. + +> You can also pass a number to `head()` and `tail()` to overwrite the default value of `5`. 
+ +Using `importedRawData.head()` we get: + +![df.head](https://img.enkipro.com/dd122d46f56ca9b68d0a416aeea42ec0.png) + +Using `importedRawData.tail()` we get: + +![df.tail](https://img.enkipro.com/54954e701a8fd2e3715788975ef01223.png) + +This is useful to know right away if your dataset has loaded or not. + +As you can see, there are a lot of columns in this dataset. + +To check the total number of rows and columns in your dataset, add `.shape` to your DataFrame. + +![dataframe-dot-shape](https://img.enkipro.com/ef84e063e9b77b20f3a069b54c8cd437.png) + +This dataset has 6234 rows and 12 columns. + +Rows start from 0 instead of 1. That is why the last row's index is 6233 instead of 6234. + +> We will remove the columns we don't need for our analysis and leave the ones we will use in this workout. + +To determine which columns we will remove, let's first check which cells have missing data. + +To check which data is missing run the `.isnull()` command: + +![raw-is-null](https://img.enkipro.com/f21cee53d181046c10b517cf0bf4c9a2.png) + +This will give us a table with `True` / `False` values. `True` meaning empty. + +> We will continue cleaning this data set in the next insight. + + +--- +## Practice + +??? is used to check if cells are empty or not? + +- `isnull()` +- `isnan()` +- `exists()` +- `isempty()` + + +--- +## Revision + +Attach 2 functions to the `cars` `DataFrame` to check and count all the missing values across all rows and columns. + +```py +cars.???.??? 
+``` + +- `isnull()` +- `sum()` +- `calculate()` +- `count` +- `isempty()` + + +--- +## Footnotes + +[1:Previous Dataset] +```python +import pandas as pd + +importedData = pd.read_csv('netflix_titles.csv') +``` diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md new file mode 100644 index 0000000000..704584569b --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md @@ -0,0 +1,125 @@ +--- +author: Stefan-Stojanovic +type: normal +category: how-to +links: + - >- + [Netflix Dataset](https://www.kaggle.com/shivamb/netflix-shows/version/3){website} + +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Importing Datasets + +--- +## Content + +Previously we learned how to create a `Series` and a `DataFrame` and what they are. + +Now we will learn how to import a data set into a `DataFrame`. + +Before that, we should first decide if we are importing from an external or a local source. After that, we have to decide what type of file we want to import. + +For now, we will import a `CSV` file called "netflix_titles.csv". + +![netflix-titles-head](https://img.enkipro.com/c9637b9513606236454371c7c6d749dd.png) + +If we are importing externally, we can just use `pandas` `.read_csv` module with the `URL` inside. + +```py +pd.read_csv('URL') +``` + +On the other hand, if we are importing from a local source, we first have to make sure the file is in our working Python directory[1]. + +Now we can import it into our project. + +> 💡 Add this code to the same notebook we created in the previous workout. + +```py +import pandas as pd + +importedRawData = pd.read_csv('netflix_titles.csv') +``` + +If you want to use the same dataset, you can download it using the link in the **Learn More** section. 
+ + +--- +## Revision + +Import a dataset called `my_data.csv` into a `DataFrame` called `df`. + +```py +??? = pd.???('???') +``` + +- df +- read_csv +- my_data.csv +- my_data +- readCSV + +--- + +## Footnotes + +[1: Working Directory] + +To check your current working directory, you have to first import the `os` library and use its `.getcwd()` module. + +```py +import os + +os.getcwd() +``` + +The `.getcwd()` module will output the current working directory. + +Just locate the folder using the path provided by the `.getcwd()` module and move the CSV file there. + +To check if a file is in a specific folder, we need to use the `.listdir()` method from the same module: + +```python +import os + +# Add the path to our folder to the path variable +path = "C:\Windows\System32\Python Testing" + +# Create an object that holds a list of all the files located in the path +directories = os.listdir(path) +``` + +We can print the files in a list: +```python +print(directories) + +# Output: +[ + 'Analyzing Netflix Titles.ipynb', + 'cars.csv', + 'daily_csv.csv', + 'monthly_csv.csv', + 'netflix_titles.csv', + 'unigram_freq.csv' +] +``` + +Or use a for loop: + +```python +for file in directories: + print(file) + +# Analyzing Netflix Titles.ipynb +# cars.csv +# daily_csv.csv +# monthly_csv.csv +# netflix_titles.csv +# unigram_freq.csv +``` diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md new file mode 100644 index 0000000000..98c4f16de0 --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md @@ -0,0 +1,156 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + - type-in-the-gap + context: standalone + +--- + +# Series & DataFrames + +--- +## Content + +With 
`pandas` you can create a one-dimensional array called `Series`. + +These one-dimensional arrays can be labeled and can hold any data type(`strings`, `objects`, `integers`, etc.). + +However, they are homogeneous. You cannot have a `string` and an `int` in the same `Series`. + +You can create a `Series` from a `list`, `dictionary`, `array`, and more. + +Here is a simple series from an array: +```python +# import pandas with the alias pd +import pandas as pd + +# create a Series +firstSeries = pd.Series([21,24,23]) +``` + +When we print `firstSeries` we get this output: +```plain-text +0 21 +1 24 +2 23 +dtype: int64 +``` + +> ⚠️ It is possible to create a `Series` from more than one data type thanks to type coercion[1]. + +### DataFrames + +`DataFrames` are `pandas` two-dimensional arrays. They are aligned in tabular format. They have labels and are created with rows and columns. + +Just like `Series`, `DataFrames` can be created from any data type. + +However, unlike `Series`, `DataFrames` are heterogeneous. You can have more than one data type in a `DataFrame`. + +Here is how to create a `DataFrame` from two different `Series`: + +```python +# Create two Series +firstSeries = pd.Series([21,24,23]) +secondSeries = pd.Series(["Stefan", "Marcus", "Emma"]) + +# Combine the series into a +# Dictonary with column identifiers +f = {"Name" : secondSeries, "Age" : firstSeries} + +# Create a DataFrame from the above Dictonary +df = pd.DataFrame(f) +``` + +When we print `df` we get: + +| id | Name | Age | +|---:|-------:|----:| +| 0 | Stefan | 21 | +| 1 | Marcus | 24 | +| 2 | Emma | 23 | + +Both `DataFrame`s and `Series` have an index. By default, this index goes from 0 to the length of the array. + +> A singular column within any `DataFrame` is defined as a `Series`. So, if a `DataFrame` has 10 columns, those 10 columns are all individual `Series`. 
+ +> We will show you different ways of how we will use these arrays to import datasets and manipulate them in the next few insights. + +--- + +## Practice + +??? are 1-Dimensional `pandas` arrays that can hold ???. + +??? are 2-Dimensional `pandas` arrays that can hold any data type. + +- Series +- any data type +- DataFrames +- only strings and integers +- only dictionaries and integers +- Lists + +--- + +## Revision + +Create a `Series` from a `numpy` array. + +```python +someArray = np.???([1, 2, 3]) + +someSeries = pd.???(???) +``` + +- array +- Series +- someArray +- series +- someSeries +- Array + +--- +## Footnotes + +[1:Coersion] + +In Python, coercion is automatic. It is when the language implicitly converts an object to a different type to avoid errors. + +For instance, you can add a float (3.1) and an int (2) without any errors. +```python +3.1 + 2 + +# 5.1 +``` + +The python interpreter figures out that one is a `float` and the other is an `int`. It converts the `int` into a `float` and then it adds them up. + +As for the `Series`, take this example: + +```python +s = pd.Series([1, 'hello', True]) +``` + +It would compile and run without any problems. This is because the `dtype` of this Series is coerced into an "object". + +If we call `s` we would get: +```python +0 1 +1 hello +2 True +dtype: object +``` + +Which shows its `dtype` is an `object`. 
+ \ No newline at end of file diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md new file mode 100644 index 0000000000..99719cf3eb --- /dev/null +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md @@ -0,0 +1,85 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# What is pandas and why use it? + +--- +## Content + +`pandas` is an open-source data manipulation tool. It is fast, powerful, and flexible. + +It offers an efficient `DataFrame` object used for data manipulation. + +With `DataFrame`s you can import, transform, manipulate, clean, and analyze datasets. + +Think of an `DataFrame` as a library with empty shelves. When we import a dataset into the `DataFrame` we fill up the shelves with books(data). + +After we fill up our library, we can clean it up and analyze it. + +For instance, we could search through the library, remove the books we don't like, split books into different shelves by organizing them into categories, sort, count, remove duplicates, and more. + +`pandas` is built on the numerical python library called `numpy`. + +> To be able to use `pandas`, you will have to install `numpy`[1] first. + +Check this footnote to learn how to install `pandas`[2] through `pip` or `conda`. + +--- +## Practice + +`pandas` is a ??? built on ???. + +- data manipulation tool +- `numpy` +- graphical library +- `matplotlib` + +--- +## Revision + +To use `pandas` you also have to install ???. 
+ +- numpy +- matplotlib +- pyplot +- seaborn + +--- +## Footnotes + +[1:NumPy Installation] +Installing through `pip`: +```python +pip install numpy +``` + +Installing through `conda`: +```python +conda install numpy +``` + +[2:Pandas Installation] +Installing through `pip`: +```python +pip install pandas +``` + +Installing through `conda`: +```python +conda install pandas +``` diff --git a/python/data-analysis/pyda-introduction/README.md b/python/data-analysis/pyda-introduction/README.md new file mode 100644 index 0000000000..d7ea81fcc2 --- /dev/null +++ b/python/data-analysis/pyda-introduction/README.md @@ -0,0 +1,13 @@ +name: Introduction + +description: Learn how to analyze data using Python. + +insights: + - brief-introduction-to-data-analysis + - why-python-for-data-analysis + - python-data-libraries + - prerequisites + - analysis-example + +aspects: + - introduction diff --git a/python/data-analysis/pyda-introduction/analysis-example.md b/python/data-analysis/pyda-introduction/analysis-example.md new file mode 100644 index 0000000000..1995536e47 --- /dev/null +++ b/python/data-analysis/pyda-introduction/analysis-example.md @@ -0,0 +1,45 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - '[Python](https://www.python.org/downloads/){website}' + - '[Installation guide](https://wiki.python.org/moin/BeginnersGuide/Download){website}' + +--- + +# Analysis Process Example + +--- +## Content + +Now that you know what this topic is about, let's take a look at an example of the data analysis process! + +First, we figure out which question we want to answer. + +- I want to find which employees have worked for more than 1000 hours. + +This means gathering all the workers' information along with dates they have started working on. + +- Next, we want to measure hours from the beginning of their employment. 
+ +If we have the exact dates the employees started work on, as well as how many hours they put in daily/weekly, we can calculate how many hours they have worked for. + +- Then, we have different tools we can use to spot patterns. + +Since this is a relatively simple task, there are no patterns to look at here. As an example of a pattern, you could find the day on which people worked most. + +- Analyze/conclude. + +After the analysis, we've found that 6 employees have worked more than 1000 hours. + +- Communicate/share findings + +After analyzing all employees, these six employees have the most hours put in with Emma having the most. + +![analysis-result](https://img.enkipro.com/f0b7aab5f47088e19ebeac44affb98c1.png) + +> 💡 You will also learn how to visualize your findings throughout this course. \ No newline at end of file diff --git a/python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md b/python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md new file mode 100644 index 0000000000..a50e85e266 --- /dev/null +++ b/python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md @@ -0,0 +1,28 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +--- + +# Intro to Data Analysis With Python + +--- +## Content + +Data analysis is the process of analyzing and discovering useful information from data. + +The process has a handful of steps: + +- Decide which questions you want to answer +- Figure out which data you need +- Decide how you want to measure and gather your data +- After collecting the data, you can use different tools to spot patterns +- Analyze/conclude your findings +- Communicate/share the findings + +> 💡 The analysis process is iterative. You will sometimes repeat the same steps to get different or more accurate results based on newly acquired data. + +> We will explain why Python was chosen for this course in the next insights. 
diff --git a/python/data-analysis/pyda-introduction/prerequisites.md b/python/data-analysis/pyda-introduction/prerequisites.md new file mode 100644 index 0000000000..23a74557dd --- /dev/null +++ b/python/data-analysis/pyda-introduction/prerequisites.md @@ -0,0 +1,30 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: must-know + +--- + +# Prerequisites + +--- +## Content + +In this course, you will learn how to analyze data with Python. + +If you've never done any analysis, you can complete the first few workouts of our [Data Analysis](https://app.enkipro.com/skill/data-analysis) topic. + +Also, if you have never done any Python, you should complete a couple of workouts in our [Core](https://app.enkipro.com/course/python-core) course of the **Python** topic. + +These workouts will teach some useful information about Python that is necessary for this course. + +| Prerequisites | Description | +|----------------------|------------------------------------------| +| Variable declaration | How variables are created | +| Pemdas/Bomdas | Order of operation in Python | +| Input/Output | How to import and export information | +| Types of variables | `boolean`, `string`, etc | +| Different operators | `in`, `out`, etc | +| Reserved keywords | `print`, `return`, `def`, etc | diff --git a/python/data-analysis/pyda-introduction/python-data-libraries.md b/python/data-analysis/pyda-introduction/python-data-libraries.md new file mode 100644 index 0000000000..54ce9c74c9 --- /dev/null +++ b/python/data-analysis/pyda-introduction/python-data-libraries.md @@ -0,0 +1,77 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +links: + - >- + [Built-in Functions](https://docs.python.org/3/library/functions.html){documentation} + +practiceQuestion: + formats: + - fill-in-the-gap + context: standalone +revisionQuestion: + formats: + - fill-in-the-gap + context: standalone + +--- + +# Python Data Libraries + +--- +## Content + +Libraries are groups of 
pre-written code often referred to as *modules*. + +You can import the libraries and use their *modules* in your code. + +A module is a file with Python code that contains one or more variables, functions, and classes. + +For instance, Python comes with a built-in *Standard Library*. This library provides several built-in modules. + +Among them, we have built-in functions, constants, types, exceptions, and so on... + +This library is very extensive. Some functions included there are: + +| Func Name | Brief Explanation | Example | +|-----------|------------------------------------|---------------------------------------| +| `print()` | Used to print a message | `print("Hello World")` | +| `dict()` | Used to create a dictionary | `x = dict(name = "Stefan", age = 27)` | +| `abs()` | Get the absolute value of a number | `abs(-5)` | + +> 💡 To view the full list of modules from the *standard library* check out the **Learn more** section. + +Functions from the *standard library* are always available. + +The ones that don't fall in the *standard library* category are the community-driven libraries. + +Here're some well-known libraries for data analysis which we will use throughout this course: + +`matplotlib` is used to create visualizations. + +`numpy` makes working with arrays much easier and faster than Python lists. + +`pandas` has many useful tools used for data analysis. + +--- +## Practice + +The groups of modules that come built-in with Python are referred to as the ???. + +- standard library +- main modules +- standard modules + +--- +## Revision + +Python doesn't come with any built-in modules. + +??? 
+ +- False +- True diff --git a/python/data-analysis/pyda-introduction/why-python-for-data-analysis.md b/python/data-analysis/pyda-introduction/why-python-for-data-analysis.md new file mode 100644 index 0000000000..05c3d1a760 --- /dev/null +++ b/python/data-analysis/pyda-introduction/why-python-for-data-analysis.md @@ -0,0 +1,29 @@ +--- +author: Stefan-Stojanovic + +type: normal + +category: how-to + +--- + +# Why Python for Data Analysis + +--- +## Content + +Most people choose Python when working with data. Whether it's for analysis, visualization, or manipulation of data, Python is very flexible. + +There are several good reasons for this. + +Python closely resembles the English language, which makes it quick to get started with, especially for beginners. + +It has a mature ecosystem of tools for extracting and manipulating data. + +These tools usually come from an ever-expanding collection of libraries, many of which are community-driven. + +New libraries or updates to existing libraries arrive often. + +Many of these libraries are created for specific tasks like analysis, visualization, array manipulation, and more. + +> 💡 We will dive into the different libraries Python has to offer in the next few workouts. diff --git a/python/python-core/README.md b/python/python-core/README.md index 44ac84979f..6ac240845b 100644 --- a/python/python-core/README.md +++ b/python/python-core/README.md @@ -202,6 +202,6 @@ standards: 6: Create a spy to verify the behavior of a function next: - - python:functional-programming + - python:data-analysis From 1bd9667c79e2de685df2746496f964b1891916eb Mon Sep 17 00:00:00 2001 From: Nemanja Stojanovic Date: Thu, 5 Dec 2024 19:43:27 -0500 Subject: [PATCH 2/3] unique slugs --- python/data-analysis/pyda-analyzing-ii/README.md | 12 ++++++------ ... pyda-counting-and-transforming-series-ii.md} | 0 ... 
=> pyda-counting-and-transforming-series.md} | 0 ...a-movie-count-per-release-year-visualized.md} | 0 ...r.md => pyda-movie-count-per-release-year.md} | 0 ...{tvshow-ratings.md => pyda-tvshow-ratings.md} | 0 ...tings.md => pyda-visualize-tvshow-ratings.md} | 0 .../data-analysis/pyda-analyzing-iii/README.md | 10 +++++----- ...up-sizes.md => pyda-computing-group-sizes.md} | 0 ...r-actor-ii.md => pyda-movies-per-actor-ii.md} | 0 ...ies-per-actor.md => pyda-movies-per-actor.md} | 0 ...-actors.md => pyda-total-number-of-actors.md} | 0 ...how-per-actor.md => pyda-tvshow-per-actor.md} | 0 python/data-analysis/pyda-analyzing-iv/README.md | 6 +++--- ...st-movies-ii.md => pyda-longest-movies-ii.md} | 0 ...{longest-movies.md => pyda-longest-movies.md} | 0 ...ng-shows.md => pyda-longest-running-shows.md} | 0 python/data-analysis/pyda-analyzing/README.md | 16 ++++++++-------- .../{dataframe-loc.md => pyda-dataframe-loc.md} | 0 .../{end-goals.md => pyda-end-goals.md} | 0 ...hart-ii.md => pyda-modifying-pie-chart-ii.md} | 0 ...-pie-chart.md => pyda-modifying-pie-chart.md} | 0 ...vshow-count.md => pyda-movie-tvshow-count.md} | 0 ...s-per-rating.md => pyda-movies-per-rating.md} | 0 .../pyda-da-analysis-environments/README.md | 10 +++++----- ...k.md => pyda-creating-your-first-notebook.md} | 0 ...-to-use.md => pyda-different-tools-to-use.md} | 0 ...ts.md => pyda-ipython-vs-shell-vs-scripts.md} | 0 .../{notebooks.md => pyda-notebooks.md} | 0 ...md => pyda-what-are-analysis-environments.md} | 0 python/data-analysis/pyda-da-tips/README.md | 4 ++-- ...das-profiling.md => pyda-pandas-profiling.md} | 0 .../{str-contains.md => pyda-str-contains.md} | 0 .../README.md | 10 +++++----- ...dataset-ii.md => pyda-cleaning-dataset-ii.md} | 0 ...aning-dataset.md => pyda-cleaning-dataset.md} | 0 ...-data-sets.md => pyda-importing-data-sets.md} | 0 python/data-analysis/pyda-introduction/README.md | 10 +++++----- ...lysis-example.md => pyda-analysis-example.md} | 0 ... 
pyda-brief-introduction-to-data-analysis.md} | 0 .../{prerequisites.md => pyda-prerequisites.md} | 0 ...ibraries.md => pyda-python-data-libraries.md} | 0 ...s.md => pyda-why-python-for-data-analysis.md} | 0 43 files changed, 39 insertions(+), 39 deletions(-) rename python/data-analysis/pyda-analyzing-ii/{counting-and-transforming-series-ii.md => pyda-counting-and-transforming-series-ii.md} (100%) rename python/data-analysis/pyda-analyzing-ii/{counting-and-transforming-series.md => pyda-counting-and-transforming-series.md} (100%) rename python/data-analysis/pyda-analyzing-ii/{movie-count-per-release-year-visualized.md => pyda-movie-count-per-release-year-visualized.md} (100%) rename python/data-analysis/pyda-analyzing-ii/{movie-count-per-release-year.md => pyda-movie-count-per-release-year.md} (100%) rename python/data-analysis/pyda-analyzing-ii/{tvshow-ratings.md => pyda-tvshow-ratings.md} (100%) rename python/data-analysis/pyda-analyzing-ii/{visualize-tvshow-ratings.md => pyda-visualize-tvshow-ratings.md} (100%) rename python/data-analysis/pyda-analyzing-iii/{computing-group-sizes.md => pyda-computing-group-sizes.md} (100%) rename python/data-analysis/pyda-analyzing-iii/{movies-per-actor-ii.md => pyda-movies-per-actor-ii.md} (100%) rename python/data-analysis/pyda-analyzing-iii/{movies-per-actor.md => pyda-movies-per-actor.md} (100%) rename python/data-analysis/pyda-analyzing-iii/{total-number-of-actors.md => pyda-total-number-of-actors.md} (100%) rename python/data-analysis/pyda-analyzing-iii/{tvshow-per-actor.md => pyda-tvshow-per-actor.md} (100%) rename python/data-analysis/pyda-analyzing-iv/{longest-movies-ii.md => pyda-longest-movies-ii.md} (100%) rename python/data-analysis/pyda-analyzing-iv/{longest-movies.md => pyda-longest-movies.md} (100%) rename python/data-analysis/pyda-analyzing-iv/{longest-running-shows.md => pyda-longest-running-shows.md} (100%) rename python/data-analysis/pyda-analyzing/{dataframe-loc.md => pyda-dataframe-loc.md} (100%) rename 
python/data-analysis/pyda-analyzing/{end-goals.md => pyda-end-goals.md} (100%) rename python/data-analysis/pyda-analyzing/{modifying-pie-chart-ii.md => pyda-modifying-pie-chart-ii.md} (100%) rename python/data-analysis/pyda-analyzing/{modifying-pie-chart.md => pyda-modifying-pie-chart.md} (100%) rename python/data-analysis/pyda-analyzing/{movie-tvshow-count.md => pyda-movie-tvshow-count.md} (100%) rename python/data-analysis/pyda-analyzing/{movies-per-rating.md => pyda-movies-per-rating.md} (100%) rename python/data-analysis/pyda-da-analysis-environments/{creating-your-first-notebook.md => pyda-creating-your-first-notebook.md} (100%) rename python/data-analysis/pyda-da-analysis-environments/{different-tools-to-use.md => pyda-different-tools-to-use.md} (100%) rename python/data-analysis/pyda-da-analysis-environments/{ipython-vs-shell-vs-scripts.md => pyda-ipython-vs-shell-vs-scripts.md} (100%) rename python/data-analysis/pyda-da-analysis-environments/{notebooks.md => pyda-notebooks.md} (100%) rename python/data-analysis/pyda-da-analysis-environments/{what-are-analysis-environments.md => pyda-what-are-analysis-environments.md} (100%) rename python/data-analysis/pyda-da-tips/{pandas-profiling.md => pyda-pandas-profiling.md} (100%) rename python/data-analysis/pyda-da-tips/{str-contains.md => pyda-str-contains.md} (100%) rename python/data-analysis/pyda-initializing-and-cleaning-datasets/{cleaning-dataset-ii.md => pyda-cleaning-dataset-ii.md} (100%) rename python/data-analysis/pyda-initializing-and-cleaning-datasets/{cleaning-dataset.md => pyda-cleaning-dataset.md} (100%) rename python/data-analysis/pyda-initializing-and-cleaning-datasets/{importing-data-sets.md => pyda-importing-data-sets.md} (100%) rename python/data-analysis/pyda-introduction/{analysis-example.md => pyda-analysis-example.md} (100%) rename python/data-analysis/pyda-introduction/{brief-introduction-to-data-analysis.md => pyda-brief-introduction-to-data-analysis.md} (100%) rename 
python/data-analysis/pyda-introduction/{prerequisites.md => pyda-prerequisites.md} (100%) rename python/data-analysis/pyda-introduction/{python-data-libraries.md => pyda-python-data-libraries.md} (100%) rename python/data-analysis/pyda-introduction/{why-python-for-data-analysis.md => pyda-why-python-for-data-analysis.md} (100%) diff --git a/python/data-analysis/pyda-analyzing-ii/README.md b/python/data-analysis/pyda-analyzing-ii/README.md index 712b0914b4..e90aae3ab2 100644 --- a/python/data-analysis/pyda-analyzing-ii/README.md +++ b/python/data-analysis/pyda-analyzing-ii/README.md @@ -3,12 +3,12 @@ name: Analysis II description: Learn more methods for manipulating a dataset and visualizing outputs. insights: - - counting-and-transforming-series - - counting-and-transforming-series-ii - - tvshow-ratings - - visualize-tvshow-ratings - - movie-count-per-release-year - - movie-count-per-release-year-visualized + - pyda-counting-and-transforming-series + - pyda-counting-and-transforming-series-ii + - pyda-tvshow-ratings + - pyda-visualize-tvshow-ratings + - pyda-movie-count-per-release-year + - pyda-movie-count-per-release-year-visualized aspects: - workout diff --git a/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md b/python/data-analysis/pyda-analyzing-ii/pyda-counting-and-transforming-series-ii.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series-ii.md rename to python/data-analysis/pyda-analyzing-ii/pyda-counting-and-transforming-series-ii.md diff --git a/python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md b/python/data-analysis/pyda-analyzing-ii/pyda-counting-and-transforming-series.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/counting-and-transforming-series.md rename to python/data-analysis/pyda-analyzing-ii/pyda-counting-and-transforming-series.md diff --git 
a/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md b/python/data-analysis/pyda-analyzing-ii/pyda-movie-count-per-release-year-visualized.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year-visualized.md rename to python/data-analysis/pyda-analyzing-ii/pyda-movie-count-per-release-year-visualized.md diff --git a/python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md b/python/data-analysis/pyda-analyzing-ii/pyda-movie-count-per-release-year.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/movie-count-per-release-year.md rename to python/data-analysis/pyda-analyzing-ii/pyda-movie-count-per-release-year.md diff --git a/python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md b/python/data-analysis/pyda-analyzing-ii/pyda-tvshow-ratings.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/tvshow-ratings.md rename to python/data-analysis/pyda-analyzing-ii/pyda-tvshow-ratings.md diff --git a/python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md b/python/data-analysis/pyda-analyzing-ii/pyda-visualize-tvshow-ratings.md similarity index 100% rename from python/data-analysis/pyda-analyzing-ii/visualize-tvshow-ratings.md rename to python/data-analysis/pyda-analyzing-ii/pyda-visualize-tvshow-ratings.md diff --git a/python/data-analysis/pyda-analyzing-iii/README.md b/python/data-analysis/pyda-analyzing-iii/README.md index 421bb9f81c..95de87e871 100644 --- a/python/data-analysis/pyda-analyzing-iii/README.md +++ b/python/data-analysis/pyda-analyzing-iii/README.md @@ -3,11 +3,11 @@ name: Analysis III description: Learn even more ways to manipulate a dataset and visualize outputs. 
insights: - - total-number-of-actors - - movies-per-actor - - movies-per-actor-ii - - tvshow-per-actor - - computing-group-sizes + - pyda-total-number-of-actors + - pyda-movies-per-actor + - pyda-movies-per-actor-ii + - pyda-tvshow-per-actor + - pyda-computing-group-sizes aspects: - workout diff --git a/python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md b/python/data-analysis/pyda-analyzing-iii/pyda-computing-group-sizes.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iii/computing-group-sizes.md rename to python/data-analysis/pyda-analyzing-iii/pyda-computing-group-sizes.md diff --git a/python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md b/python/data-analysis/pyda-analyzing-iii/pyda-movies-per-actor-ii.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iii/movies-per-actor-ii.md rename to python/data-analysis/pyda-analyzing-iii/pyda-movies-per-actor-ii.md diff --git a/python/data-analysis/pyda-analyzing-iii/movies-per-actor.md b/python/data-analysis/pyda-analyzing-iii/pyda-movies-per-actor.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iii/movies-per-actor.md rename to python/data-analysis/pyda-analyzing-iii/pyda-movies-per-actor.md diff --git a/python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md b/python/data-analysis/pyda-analyzing-iii/pyda-total-number-of-actors.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iii/total-number-of-actors.md rename to python/data-analysis/pyda-analyzing-iii/pyda-total-number-of-actors.md diff --git a/python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md b/python/data-analysis/pyda-analyzing-iii/pyda-tvshow-per-actor.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iii/tvshow-per-actor.md rename to python/data-analysis/pyda-analyzing-iii/pyda-tvshow-per-actor.md diff --git a/python/data-analysis/pyda-analyzing-iv/README.md 
b/python/data-analysis/pyda-analyzing-iv/README.md index 889b532532..cd63da5603 100644 --- a/python/data-analysis/pyda-analyzing-iv/README.md +++ b/python/data-analysis/pyda-analyzing-iv/README.md @@ -3,9 +3,9 @@ name: Analysis IV description: Learn how to separate values for easier analysis. insights: - - longest-running-shows - - longest-movies - - longest-movies-ii + - pyda-longest-running-shows + - pyda-longest-movies + - pyda-longest-movies-ii aspects: - workout \ No newline at end of file diff --git a/python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md b/python/data-analysis/pyda-analyzing-iv/pyda-longest-movies-ii.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iv/longest-movies-ii.md rename to python/data-analysis/pyda-analyzing-iv/pyda-longest-movies-ii.md diff --git a/python/data-analysis/pyda-analyzing-iv/longest-movies.md b/python/data-analysis/pyda-analyzing-iv/pyda-longest-movies.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iv/longest-movies.md rename to python/data-analysis/pyda-analyzing-iv/pyda-longest-movies.md diff --git a/python/data-analysis/pyda-analyzing-iv/longest-running-shows.md b/python/data-analysis/pyda-analyzing-iv/pyda-longest-running-shows.md similarity index 100% rename from python/data-analysis/pyda-analyzing-iv/longest-running-shows.md rename to python/data-analysis/pyda-analyzing-iv/pyda-longest-running-shows.md diff --git a/python/data-analysis/pyda-analyzing/README.md b/python/data-analysis/pyda-analyzing/README.md index 85f6085703..33ddb90d78 100644 --- a/python/data-analysis/pyda-analyzing/README.md +++ b/python/data-analysis/pyda-analyzing/README.md @@ -3,14 +3,14 @@ name: Analysis I description: Learn different methods for manipulating a dataset and visualizing outputs. 
insights: - - end-goals - - movie-tvshow-count - - visualize-movie-show-count - - modifying-pie-chart - - modifying-pie-chart-ii - - movies-per-rating - - dataframe-loc - - visualize-movie-ratings + - pyda-end-goals + - pyda-movie-tvshow-count + - pyda-visualize-movie-show-count + - pyda-modifying-pie-chart + - pyda-modifying-pie-chart-ii + - pyda-movies-per-rating + - pyda-dataframe-loc + - pyda-visualize-movie-ratings aspects: - introduction diff --git a/python/data-analysis/pyda-analyzing/dataframe-loc.md b/python/data-analysis/pyda-analyzing/pyda-dataframe-loc.md similarity index 100% rename from python/data-analysis/pyda-analyzing/dataframe-loc.md rename to python/data-analysis/pyda-analyzing/pyda-dataframe-loc.md diff --git a/python/data-analysis/pyda-analyzing/end-goals.md b/python/data-analysis/pyda-analyzing/pyda-end-goals.md similarity index 100% rename from python/data-analysis/pyda-analyzing/end-goals.md rename to python/data-analysis/pyda-analyzing/pyda-end-goals.md diff --git a/python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md b/python/data-analysis/pyda-analyzing/pyda-modifying-pie-chart-ii.md similarity index 100% rename from python/data-analysis/pyda-analyzing/modifying-pie-chart-ii.md rename to python/data-analysis/pyda-analyzing/pyda-modifying-pie-chart-ii.md diff --git a/python/data-analysis/pyda-analyzing/modifying-pie-chart.md b/python/data-analysis/pyda-analyzing/pyda-modifying-pie-chart.md similarity index 100% rename from python/data-analysis/pyda-analyzing/modifying-pie-chart.md rename to python/data-analysis/pyda-analyzing/pyda-modifying-pie-chart.md diff --git a/python/data-analysis/pyda-analyzing/movie-tvshow-count.md b/python/data-analysis/pyda-analyzing/pyda-movie-tvshow-count.md similarity index 100% rename from python/data-analysis/pyda-analyzing/movie-tvshow-count.md rename to python/data-analysis/pyda-analyzing/pyda-movie-tvshow-count.md diff --git a/python/data-analysis/pyda-analyzing/movies-per-rating.md 
b/python/data-analysis/pyda-analyzing/pyda-movies-per-rating.md similarity index 100% rename from python/data-analysis/pyda-analyzing/movies-per-rating.md rename to python/data-analysis/pyda-analyzing/pyda-movies-per-rating.md diff --git a/python/data-analysis/pyda-da-analysis-environments/README.md b/python/data-analysis/pyda-da-analysis-environments/README.md index d4ab038d4c..b432be8af0 100644 --- a/python/data-analysis/pyda-da-analysis-environments/README.md +++ b/python/data-analysis/pyda-da-analysis-environments/README.md @@ -3,11 +3,11 @@ name: Analysis Environments description: Get familiar with different analysis environments. insights: - - what-are-analysis-environments - - ipython-vs-shell-vs-scripts - - different-tools-to-use - - notebooks - - creating-your-first-notebook + - pyda-what-are-analysis-environments + - pyda-ipython-vs-shell-vs-scripts + - pyda-different-tools-to-use + - pyda-notebooks + - pyda-creating-your-first-notebook aspects: - introduction \ No newline at end of file diff --git a/python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md b/python/data-analysis/pyda-da-analysis-environments/pyda-creating-your-first-notebook.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/creating-your-first-notebook.md rename to python/data-analysis/pyda-da-analysis-environments/pyda-creating-your-first-notebook.md diff --git a/python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md b/python/data-analysis/pyda-da-analysis-environments/pyda-different-tools-to-use.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/different-tools-to-use.md rename to python/data-analysis/pyda-da-analysis-environments/pyda-different-tools-to-use.md diff --git a/python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md b/python/data-analysis/pyda-da-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md similarity index 100% 
rename from python/data-analysis/pyda-da-analysis-environments/ipython-vs-shell-vs-scripts.md rename to python/data-analysis/pyda-da-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md diff --git a/python/data-analysis/pyda-da-analysis-environments/notebooks.md b/python/data-analysis/pyda-da-analysis-environments/pyda-notebooks.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/notebooks.md rename to python/data-analysis/pyda-da-analysis-environments/pyda-notebooks.md diff --git a/python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md b/python/data-analysis/pyda-da-analysis-environments/pyda-what-are-analysis-environments.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/what-are-analysis-environments.md rename to python/data-analysis/pyda-da-analysis-environments/pyda-what-are-analysis-environments.md diff --git a/python/data-analysis/pyda-da-tips/README.md b/python/data-analysis/pyda-da-tips/README.md index e796913511..940c31ead6 100644 --- a/python/data-analysis/pyda-da-tips/README.md +++ b/python/data-analysis/pyda-da-tips/README.md @@ -3,8 +3,8 @@ name: Tips description: Useful tips for analyzing data with Python. 
insights: - - str-contains - - pandas-profiling + - pyda-str-contains + - pyda-pandas-profiling aspects: - introduction \ No newline at end of file diff --git a/python/data-analysis/pyda-da-tips/pandas-profiling.md b/python/data-analysis/pyda-da-tips/pyda-pandas-profiling.md similarity index 100% rename from python/data-analysis/pyda-da-tips/pandas-profiling.md rename to python/data-analysis/pyda-da-tips/pyda-pandas-profiling.md diff --git a/python/data-analysis/pyda-da-tips/str-contains.md b/python/data-analysis/pyda-da-tips/pyda-str-contains.md similarity index 100% rename from python/data-analysis/pyda-da-tips/str-contains.md rename to python/data-analysis/pyda-da-tips/pyda-str-contains.md diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md index 6f5162d199..600fc736a7 100644 --- a/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md +++ b/python/data-analysis/pyda-initializing-and-cleaning-datasets/README.md @@ -3,11 +3,11 @@ name: Preparing a Dataset description: Learn what Series and Dataframes are, and how to prepare a dataset for analysis. 
insights: - - what-and-why-pandas - - series-and-dataframes - - importing-data-sets - - cleaning-dataset - - cleaning-dataset-ii + - pyda-what-and-why-pandas + - pyda-series-and-dataframes + - pyda-importing-data-sets + - pyda-cleaning-dataset + - pyda-cleaning-dataset-ii aspects: - introduction diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-cleaning-dataset-ii.md similarity index 100% rename from python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset-ii.md rename to python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-cleaning-dataset-ii.md diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-cleaning-dataset.md similarity index 100% rename from python/data-analysis/pyda-initializing-and-cleaning-datasets/cleaning-dataset.md rename to python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-cleaning-dataset.md diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-importing-data-sets.md similarity index 100% rename from python/data-analysis/pyda-initializing-and-cleaning-datasets/importing-data-sets.md rename to python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-importing-data-sets.md diff --git a/python/data-analysis/pyda-introduction/README.md b/python/data-analysis/pyda-introduction/README.md index d7ea81fcc2..da56ad23eb 100644 --- a/python/data-analysis/pyda-introduction/README.md +++ b/python/data-analysis/pyda-introduction/README.md @@ -3,11 +3,11 @@ name: Introduction description: Learn how to analyze data using Python. 
insights: - - brief-introduction-to-data-analysis - - why-python-for-data-analysis - - python-data-libraries - - prerequisites - - analysis-example + - pyda-brief-introduction-to-data-analysis + - pyda-why-python-for-data-analysis + - pyda-python-data-libraries + - pyda-prerequisites + - pyda-analysis-example aspects: - introduction diff --git a/python/data-analysis/pyda-introduction/analysis-example.md b/python/data-analysis/pyda-introduction/pyda-analysis-example.md similarity index 100% rename from python/data-analysis/pyda-introduction/analysis-example.md rename to python/data-analysis/pyda-introduction/pyda-analysis-example.md diff --git a/python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md b/python/data-analysis/pyda-introduction/pyda-brief-introduction-to-data-analysis.md similarity index 100% rename from python/data-analysis/pyda-introduction/brief-introduction-to-data-analysis.md rename to python/data-analysis/pyda-introduction/pyda-brief-introduction-to-data-analysis.md diff --git a/python/data-analysis/pyda-introduction/prerequisites.md b/python/data-analysis/pyda-introduction/pyda-prerequisites.md similarity index 100% rename from python/data-analysis/pyda-introduction/prerequisites.md rename to python/data-analysis/pyda-introduction/pyda-prerequisites.md diff --git a/python/data-analysis/pyda-introduction/python-data-libraries.md b/python/data-analysis/pyda-introduction/pyda-python-data-libraries.md similarity index 100% rename from python/data-analysis/pyda-introduction/python-data-libraries.md rename to python/data-analysis/pyda-introduction/pyda-python-data-libraries.md diff --git a/python/data-analysis/pyda-introduction/why-python-for-data-analysis.md b/python/data-analysis/pyda-introduction/pyda-why-python-for-data-analysis.md similarity index 100% rename from python/data-analysis/pyda-introduction/why-python-for-data-analysis.md rename to python/data-analysis/pyda-introduction/pyda-why-python-for-data-analysis.md From 
9361f797bfb4e8c099581bdf1a857bd4c11d8d6c Mon Sep 17 00:00:00 2001 From: Nemanja Stojanovic Date: Thu, 5 Dec 2024 20:13:05 -0500 Subject: [PATCH 3/3] https://github.com/enkidevs/curriculum/pull/3332#issuecomment-2521836382 --- .../README.md | 0 .../pyda-creating-your-first-notebook.md | 0 .../pyda-different-tools-to-use.md | 0 .../pyda-ipython-vs-shell-vs-scripts.md | 0 .../pyda-notebooks.md | 0 .../pyda-what-are-analysis-environments.md | 0 ...visualize-movie-ratings.md => pyda-visualize-movie-ratings.md} | 0 ...ize-movie-show-count.md => pyda-visualize-movie-show-count.md} | 0 .../{series-and-dataframes.md => pyda-series-and-dataframes.md} | 0 .../{what-and-why-pandas.md => pyda-what-and-why-pandas.md} | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/README.md (100%) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/pyda-creating-your-first-notebook.md (100%) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/pyda-different-tools-to-use.md (100%) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/pyda-ipython-vs-shell-vs-scripts.md (100%) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/pyda-notebooks.md (100%) rename python/data-analysis/{pyda-da-analysis-environments => pyda-analysis-environments}/pyda-what-are-analysis-environments.md (100%) rename python/data-analysis/pyda-analyzing/{visualize-movie-ratings.md => pyda-visualize-movie-ratings.md} (100%) rename python/data-analysis/pyda-analyzing/{visualize-movie-show-count.md => pyda-visualize-movie-show-count.md} (100%) rename python/data-analysis/pyda-initializing-and-cleaning-datasets/{series-and-dataframes.md => pyda-series-and-dataframes.md} (100%) rename python/data-analysis/pyda-initializing-and-cleaning-datasets/{what-and-why-pandas.md => 
pyda-what-and-why-pandas.md} (100%) diff --git a/python/data-analysis/pyda-da-analysis-environments/README.md b/python/data-analysis/pyda-analysis-environments/README.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/README.md rename to python/data-analysis/pyda-analysis-environments/README.md diff --git a/python/data-analysis/pyda-da-analysis-environments/pyda-creating-your-first-notebook.md b/python/data-analysis/pyda-analysis-environments/pyda-creating-your-first-notebook.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/pyda-creating-your-first-notebook.md rename to python/data-analysis/pyda-analysis-environments/pyda-creating-your-first-notebook.md diff --git a/python/data-analysis/pyda-da-analysis-environments/pyda-different-tools-to-use.md b/python/data-analysis/pyda-analysis-environments/pyda-different-tools-to-use.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/pyda-different-tools-to-use.md rename to python/data-analysis/pyda-analysis-environments/pyda-different-tools-to-use.md diff --git a/python/data-analysis/pyda-da-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md b/python/data-analysis/pyda-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md rename to python/data-analysis/pyda-analysis-environments/pyda-ipython-vs-shell-vs-scripts.md diff --git a/python/data-analysis/pyda-da-analysis-environments/pyda-notebooks.md b/python/data-analysis/pyda-analysis-environments/pyda-notebooks.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/pyda-notebooks.md rename to python/data-analysis/pyda-analysis-environments/pyda-notebooks.md diff --git a/python/data-analysis/pyda-da-analysis-environments/pyda-what-are-analysis-environments.md 
b/python/data-analysis/pyda-analysis-environments/pyda-what-are-analysis-environments.md similarity index 100% rename from python/data-analysis/pyda-da-analysis-environments/pyda-what-are-analysis-environments.md rename to python/data-analysis/pyda-analysis-environments/pyda-what-are-analysis-environments.md diff --git a/python/data-analysis/pyda-analyzing/visualize-movie-ratings.md b/python/data-analysis/pyda-analyzing/pyda-visualize-movie-ratings.md similarity index 100% rename from python/data-analysis/pyda-analyzing/visualize-movie-ratings.md rename to python/data-analysis/pyda-analyzing/pyda-visualize-movie-ratings.md diff --git a/python/data-analysis/pyda-analyzing/visualize-movie-show-count.md b/python/data-analysis/pyda-analyzing/pyda-visualize-movie-show-count.md similarity index 100% rename from python/data-analysis/pyda-analyzing/visualize-movie-show-count.md rename to python/data-analysis/pyda-analyzing/pyda-visualize-movie-show-count.md diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-series-and-dataframes.md similarity index 100% rename from python/data-analysis/pyda-initializing-and-cleaning-datasets/series-and-dataframes.md rename to python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-series-and-dataframes.md diff --git a/python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md b/python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-what-and-why-pandas.md similarity index 100% rename from python/data-analysis/pyda-initializing-and-cleaning-datasets/what-and-why-pandas.md rename to python/data-analysis/pyda-initializing-and-cleaning-datasets/pyda-what-and-why-pandas.md