import urllib.request
import os

# Download the TidyTuesday yarn dataset once; skip if it's already on disk.
filename = "yarn.csv"
if not os.path.exists(filename):
    url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/6830f858fd0e87af47dfa1ecc7043b7c05f85e69/data/2022/2022-10-11/yarn.csv"
    urllib.request.urlretrieve(url, filename)
In this article, I’m taking the Python data frame library polars for a spin. Polars is a super fast alternative to pandas, implemented in Rust. It also has a leaner interface and doesn’t need an index column. To learn more about how it compares to other data frame libraries, see my article about data frames.
I’m analyzing a dataset about yarns from the knitting website Ravelry. You can find the dataset on Github.
It lists 100,000 yarns, with information about the yarn’s name, brand, weight and rating by Ravelry users.
First, let’s load the data and have a look at it. I load the data directly from the Github repository.
Now I have a CSV file on disk. I can load it into a polars DataFrame. Here, I’ve specified the column types manually, so polars doesn’t have to guess them.
import polars as pl

# Load the CSV with explicit column types so polars doesn't have to infer them.
# "NA" strings are treated as nulls; malformed rows are skipped rather than raising.
yarn = pl.read_csv(
    source="yarn.csv",
    has_header=True,
    null_values=["NA"],
    ignore_errors=True,
    dtypes={
        "discontinued": pl.Boolean,
        "gauge_divisor": pl.Int32,
        "grams": pl.Int32,
        "id": pl.Int32,
        "machine_washable": pl.Boolean,
        "max_gauge": pl.Float64,
        "min_gauge": pl.Float64,
        "name": pl.Utf8,
        "permalink": pl.Utf8,
        "rating_average": pl.Float64,
        "rating_count": pl.Int32,
        "rating_total": pl.Int32,
        "texture": pl.Utf8,
        "thread_size": pl.Utf8,
        "wpi": pl.Int32,
        "yardage": pl.Int32,
        "yarn_company_name": pl.Utf8,
        "yarn_weight_crochet_gauge": pl.Float64,
        "yarn_weight_id": pl.Int32,
        "yarn_weight_knit_gauge": pl.Float64,
        "yarn_weight_name": pl.Utf8,
        "yarn_weight_ply": pl.Int32,
        "yarn_weight_wpi": pl.Int32,
        "texture_clean": pl.Utf8,
    },
)
yarn.head(10)
discontinued | gauge_divisor | grams | id | machine_washable | max_gauge | min_gauge | name | permalink | rating_average | rating_count | rating_total | texture | thread_size | wpi | yardage | yarn_company_name | yarn_weight_crochet_gauge | yarn_weight_id | yarn_weight_knit_gauge | yarn_weight_name | yarn_weight_ply | yarn_weight_wpi | texture_clean |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
bool | i32 | i32 | i32 | bool | f64 | f64 | str | str | f64 | i32 | i32 | str | str | i32 | i32 | str | f64 | i32 | f64 | str | i32 | i32 | str |
false | 4 | 198 | 2059 | true | null | 17.0 | "Super Saver So… | "red-heart-supe… | 3.58 | 17616 | 63069 | "cable plied" | null | null | 364 | "Red Heart" | null | 1 | 18.0 | "Aran" | 10 | 8 | "cable plied" |
false | 4 | 170 | 3330 | true | null | 18.0 | "Simply Soft So… | "caron-simply-s… | 4.03 | 19133 | 77147 | "plied" | null | null | 315 | "Caron" | null | 1 | 18.0 | "Aran" | 10 | 8 | "plied" |
false | 4 | 100 | 523 | null | 20.0 | 18.0 | "Cascade 220®" | "cascade-yarns-… | 4.48 | 21517 | 96470 | "plied" | null | 9 | 220 | "Cascade Yarns … | null | 12 | 20.0 | "Worsted" | 10 | 9 | "plied" |
false | 4 | 100 | 5741 | true | null | 16.0 | "Vanna's Choice… | "lion-brand-van… | 3.87 | 13959 | 54036 | "plied" | null | null | 170 | "Lion Brand" | null | 1 | 18.0 | "Aran" | 10 | 8 | "plied" |
false | 4 | 100 | 1666 | null | null | 18.0 | "Worsted" | "malabrigo-yarn… | 4.73 | 20638 | 97630 | "singles" | null | 8 | 210 | "Malabrigo Yarn… | null | 1 | 18.0 | "Aran" | 10 | 8 | "singles" |
false | 4 | 100 | 62569 | true | 22.0 | 18.0 | "Rios" | "malabrigo-yarn… | 4.81 | 20250 | 97421 | "plied" | null | null | 210 | "Malabrigo Yarn… | null | 12 | 20.0 | "Worsted" | 10 | 9 | "plied" |
false | 4 | 70 | 818 | true | null | 20.0 | "Sugar'n Cream … | "lily-sugarn-cr… | 4.11 | 13053 | 53632 | "4 single plies… | null | null | 120 | "Lily" | null | 12 | 20.0 | "Worsted" | 10 | 9 | "4 single plies… |
false | 4 | 100 | 3518 | true | 22.0 | 20.0 | "220 Superwash" | "cascade-yarns-… | 4.42 | 14828 | 65478 | null | null | null | 220 | "Cascade Yarns … | null | 12 | 20.0 | "Worsted" | 10 | 9 | null |
false | 4 | 100 | 26385 | true | null | 32.0 | "Sock" | "malabrigo-yarn… | 4.74 | 18508 | 87693 | "plied" | null | null | 440 | "Malabrigo Yarn… | null | 13 | 32.0 | "Light Fingerin… | 3 | null | "plied" |
false | 4 | null | 53539 | true | 30.0 | 26.0 | "Tosh Merino Li… | "madelinetosh-t… | 4.7 | 15991 | 75155 | "single" | null | null | 420 | "madelinetosh" | null | 5 | 28.0 | "Fingering" | 4 | 14 | "single" |
The pl.DataFrame.describe()
method gives a quick overview of the data.
yarn.describe()
describe | discontinued | gauge_divisor | grams | id | machine_washable | max_gauge | min_gauge | name | permalink | rating_average | rating_count | rating_total | texture | thread_size | wpi | yardage | yarn_company_name | yarn_weight_crochet_gauge | yarn_weight_id | yarn_weight_knit_gauge | yarn_weight_name | yarn_weight_ply | yarn_weight_wpi | texture_clean |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | str | str | f64 | f64 | f64 | str | str | f64 | f64 | str | f64 | f64 | f64 | str | f64 | f64 | str |
"count" | 100000.0 | 100000.0 | 100000.0 | 100000.0 | 100000.0 | 100000.0 | 100000.0 | "100000" | "100000" | 100000.0 | 100000.0 | 100000.0 | "100000" | "100000" | 100000.0 | 100000.0 | "100000" | 100000.0 | 100000.0 | 100000.0 | "100000" | 100000.0 | 100000.0 | "100000" |
"null_count" | 90.0 | 29596.0 | 3782.0 | 0.0 | 45792.0 | 79630.0 | 29052.0 | "0" | "0" | 10541.0 | 10541.0 | 10541.0 | "26691" | "99407" | 96199.0 | 4266.0 | "0" | 100000.0 | 2695.0 | 33384.0 | "2695" | 9380.0 | 24074.0 | "26691" |
"mean" | 0.356531 | 3.647705 | 92.973841 | 102988.0402 | 0.673369 | 19.162726 | 20.069264 | null | null | 4.426368 | 43.181905 | 189.281146 | null | null | 12.93949 | 339.035881 | null | null | 7.454756 | 24.481746 | null | 6.393136 | 11.144773 | null |
"std" | 0.478977 | 0.962701 | 73.082122 | 61006.727934 | 0.468985 | 10.170148 | 8.030449 | null | null | 0.631511 | 320.643238 | 1407.033498 | null | null | 7.919564 | 538.963237 | null | null | 3.677407 | 4.516639 | null | 3.179723 | 2.510025 | null |
"min" | 0.0 | 1.0 | 0.0 | 24.0 | 0.0 | 0.0 | 0.0 | ""Der Halsschme… | "-" | 1.0 | 1.0 | 1.0 | ""beads on a ch… | "1" | 0.0 | 0.0 | "! Needs Brand … | null | 1.0 | 18.0 | "Aran" | 1.0 | 7.0 | ""beads on a ch… |
"25%" | null | 4.0 | 50.0 | 51014.0 | null | 8.0 | 15.0 | null | null | 4.0 | 2.0 | 10.0 | null | null | 9.0 | 137.0 | null | null | 5.0 | 20.0 | null | 4.0 | 9.0 | null |
"50%" | null | 4.0 | 100.0 | 103017.0 | null | 20.0 | 22.0 | null | null | 4.6 | 5.0 | 23.0 | null | null | 12.0 | 246.0 | null | null | 7.0 | 22.0 | null | 5.0 | 11.0 | null |
"75%" | null | 4.0 | 100.0 | 155436.0 | null | 28.0 | 28.0 | null | null | 5.0 | 17.0 | 73.0 | null | null | 14.0 | 437.0 | null | null | 11.0 | 28.0 | null | 10.0 | 14.0 | null |
"max" | 1.0 | 4.0 | 7087.0 | 218285.0 | 1.0 | 67.75 | 99.99 | "빈센트 리치 시그니처 (V… | "zwool-worsted-… | 5.0 | 21517.0 | 97630.0 | "одиночний розр… | "floss" | 127.0 | 32839.0 | "니트러브(Knitlove)… | null | 16.0 | 32.0 | "Worsted" | 12.0 | 14.0 | "одиночний розр… |
Check for missing values
A good first step in any exploratory data analysis is to check for missing values. Here, I’d like to know the percentage of missing values per column. The pl.DataFrame.describe()
method already gives the number of missing values. I use .transpose()
to turn the columns into rows, so I can use the pl.DataFrame.with_column()
method to add a new column with the percentage of missing values.
# Percentage of missing values per column: take the "null_count" row of
# describe(), transpose it into one row per column, and divide by the row count.
(
    yarn.describe()
    .filter(pl.col("describe") == "null_count")
    .drop("describe")
    .transpose(
        include_header=True,
        column_names=["null_count"],
    )
    .with_columns(pl.col("null_count").cast(pl.Float64))  # str -> float
    .with_columns((pl.col("null_count") / yarn.shape[0]).alias("null_pct"))
    .sort(pl.col("null_pct"), descending=True)
)
column | null_count | null_pct |
---|---|---|
str | f64 | f64 |
"yarn_weight_cr… | 100000.0 | 1.0 |
"thread_size" | 99407.0 | 0.99407 |
"wpi" | 96199.0 | 0.96199 |
"max_gauge" | 79630.0 | 0.7963 |
"machine_washab… | 45792.0 | 0.45792 |
"yarn_weight_kn… | 33384.0 | 0.33384 |
"gauge_divisor" | 29596.0 | 0.29596 |
"min_gauge" | 29052.0 | 0.29052 |
"texture" | 26691.0 | 0.26691 |
"texture_clean" | 26691.0 | 0.26691 |
"yarn_weight_wp… | 24074.0 | 0.24074 |
"rating_average… | 10541.0 | 0.10541 |
"rating_count" | 10541.0 | 0.10541 |
"rating_total" | 10541.0 | 0.10541 |
"yarn_weight_pl… | 9380.0 | 0.0938 |
"yardage" | 4266.0 | 0.04266 |
"grams" | 3782.0 | 0.03782 |
"yarn_weight_id… | 2695.0 | 0.02695 |
"yarn_weight_na… | 2695.0 | 0.02695 |
"discontinued" | 90.0 | 0.0009 |
"id" | 0.0 | 0.0 |
"name" | 0.0 | 0.0 |
"permalink" | 0.0 | 0.0 |
"yarn_company_n… | 0.0 | 0.0 |
Some columns have close to 100% missing values, these won’t be useful for further analysis.
Discontinued yarns
The boolean column “discontinued” indicates whether a manufacturer has stopped producing a yarn. This sparked a question: are unpopular yarns more likely to be discontinued?
Let’s see a boxplot of the rating average for discontinued and non-discontinued yarns. I visualize the data with plotly express. It can’t handle polars DataFrames, so I convert it to a pandas DataFrame first, using the pl.DataFrame.to_pandas()
method.
# Keep only the two columns needed for the comparison, dropping rows with nulls.
discontinued_df = yarn.select(
    [
        "discontinued",
        "rating_average",
    ]
).drop_nulls()

import plotly.express as px

# plotly express can't consume polars DataFrames, so convert to pandas first.
fig = px.box(
    data_frame=discontinued_df.to_pandas(),
    x="discontinued",
    y="rating_average",
    title="Rating Average by Discontinued",
    color="discontinued",
)
fig.show()
The boxplot shows that discontinued yarns (True, in red) indeed have a lower rating than non-discontinued yarns. But is this difference statistically significant? I can use a t-test to find out. scipy.stats
has a function for this. I’m choosing a two sample t-test, because I’m comparing two groups and I’m using a two-sided test because I don’t want to rule out that the discontinued yarns have a higher rating than the non-discontinued yarns.
Here, I use the pl.Series.to_numpy()
method to convert the polars Series to a numpy array.
from scipy.stats import ttest_ind

# Two-sample, two-sided t-test: discontinued vs. non-discontinued ratings.
# Each group is converted from a polars Series to a numpy array for scipy.
ttest_ind(
    a=discontinued_df.filter(pl.col("discontinued") == True)
    .select("rating_average")
    .to_numpy(),
    b=discontinued_df.filter(pl.col("discontinued") == False)
    .select("rating_average")
    .to_numpy(),
)
TtestResult(statistic=array([-79.57208971]), pvalue=array([0.]), df=array([89384.]))
So yes, the result is statistically significant. The p-value is very small, so we can reject the null hypothesis that the two groups have the same rating average.
Most popular yarn companies
Let’s have a closer look at the yarn companies. I aggregate the data frame by yarn company and calculate a number of statistics about them.
# Aggregate per yarn company: yarn count, mean rating, and total ratings.
# Only keep brands with at least 500 ratings, most-rated first.
companies = (
    yarn.groupby("yarn_company_name")
    .agg(
        [
            pl.count().alias("yarns"),
            pl.mean("rating_average").alias("mean_rating_average"),
            pl.sum("rating_count").alias("total_ratings"),
        ]
    )
    .filter(pl.col("total_ratings") > 499)
    .sort(pl.col("total_ratings"), descending=True)
)
companies
/var/folders/y6/r4nd18014svggynr61y82m4w0000gn/T/ipykernel_14323/1280698066.py:2: DeprecationWarning:
`groupby` is deprecated. It has been renamed to `group_by`.
yarn_company_name | yarns | mean_rating_average | total_ratings |
---|---|---|---|
str | u32 | f64 | i32 |
"Knit Picks" | 264 | 4.345615 | 168175 |
"Cascade Yarns … | 256 | 4.271111 | 153626 |
"Lion Brand" | 390 | 3.979581 | 149327 |
"Malabrigo Yarn… | 42 | 4.676585 | 111182 |
"Rowan" | 267 | 4.288669 | 99200 |
"Garnstudio" | 92 | 4.083111 | 86275 |
"Berroco" | 323 | 4.109444 | 85314 |
"madelinetosh" | 92 | 4.733 | 76651 |
"Red Heart" | 346 | 3.891916 | 72135 |
"Bernat" | 461 | 3.842055 | 63405 |
"Plymouth Yarn" | 391 | 4.153069 | 61151 |
"Patons North A… | 224 | 3.835442 | 60843 |
… | … | … | … |
"The Copper Cor… | 13 | 4.878462 | 514 |
"Graine de lain… | 16 | 4.731875 | 514 |
"Huckleberry Kn… | 53 | 4.739583 | 514 |
"Sunrise Fiber … | 35 | 4.858824 | 513 |
"Midara" | 41 | 4.478286 | 512 |
"Another Crafty… | 11 | 4.923636 | 512 |
"Kangaroo Dyer" | 17 | 4.445882 | 510 |
"Needful Yarns" | 40 | 3.782051 | 509 |
"Carnival" | 12 | 3.671 | 508 |
"Sterling Ridge… | 19 | 4.820556 | 508 |
"WOLLkenSchaf" | 21 | 4.67 | 507 |
"Farbularasa" | 29 | 4.911111 | 504 |
The table shows brands with at least 500 ratings on Ravelry. Lion Brand stands out with a particularly low average rating of 3.98, whereas madelinetosh scores an average rating of 4.73.
Yarn weights
My girlfriend, who is a passionate knitter, tells me that gauge weight is the most important factor for a knitting project. It determines the thickness and size of the finished product. It’s associated with the yarn_weight_ply, which is the number of threads combined to a yarn.
Which gauge sizes are most popular, based on the number of yarns available?
# Count yarns per weight class (name + ply), most common first.
(
    yarn.groupby(["yarn_weight_name", "yarn_weight_ply"])
    .agg(
        [
            pl.count().alias("yarns"),
        ]
    )
    .drop_nulls()
    .sort(pl.col("yarns"), descending=True)
)
/var/folders/y6/r4nd18014svggynr61y82m4w0000gn/T/ipykernel_14323/155294860.py:2: DeprecationWarning:
`groupby` is deprecated. It has been renamed to `group_by`.
yarn_weight_name | yarn_weight_ply | yarns |
---|---|---|
str | i32 | u32 |
"Fingering" | 4 | 26004 |
"DK" | 8 | 15686 |
"Aran" | 10 | 9292 |
"Worsted" | 10 | 9156 |
"Sport" | 5 | 8464 |
"Lace" | 2 | 7504 |
"Bulky" | 12 | 7324 |
"Light Fingerin… | 3 | 6478 |
"Cobweb" | 1 | 712 |
The “Fingering” weight, a regular yarn for knitting, is the most popular gauge weight. According to my girlfriend, it’s particularly popular in Scandinavia.
The yardage, weight and thickness of yarn is expressed with multiple metrics. Let’s see the correlation between them to better understand their meanings. Polars doesn’t have a built-in function to get the correlation between all columns. The pl.pearson_corr()
function can be used to calculate the correlation between two columns. I convert it to a pandas DataFrame to use its corr()
method.
# Pairwise correlations between the numeric yarn metrics. Polars has no
# all-columns corr(), so convert to pandas and use its corr() method.
corr = (
    yarn.select(
        [
            "yardage",
            "grams",
            "machine_washable",
            "max_gauge",
            "min_gauge",
            "yarn_weight_ply",
            "yarn_weight_knit_gauge",
            "yarn_weight_wpi",
        ]
    )
    .drop_nulls()
    .to_pandas()
    .corr()
)

# Visualize as a heatmap using plotly
import plotly.io as pio
import plotly.graph_objects as go

pio.templates.default = "plotly_white"

# Only show the upper triangle of the correlation matrix
# Set the diagonal and lower triangle to NaN
import numpy as np

mask = np.triu(np.ones_like(corr, dtype=bool))

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z=corr.mask(mask),
        x=corr.columns,
        y=corr.columns,
        colorscale=px.colors.diverging.RdBu,
        zmin=-1,
        zmax=1,
    )
)
The correlation matrix shows some facts about yarns:
- Long yarns (high yardage) make the yarn ball heavier (high grams)
- High ply yarns are typically sold in shorter yardage
- High ply yarns are less commonly machine washable
- The maximum and minimum gauge are in a small range of one another, depending on the yarn weight
- A thick yarn (high ply, high WPI (wraps per inch)) means fewer stitches fit into the gauge
And that’s it! I hope you’ve enjoyed this analysis of the Ravelry yarn data. If you want to learn more about polars, check out the documentation and the GitHub repository.
Photo by Margarida Afonso on Unsplash