library(tidyverse)
library(readxl)
library(openxlsx)
# NY cost of production per acre (from producer interviews) and its
# standard error, assumed to be 25% of the mean
ny_production_cost_acre <- 525
ny_production_cost_acre_se <- 0.25 * ny_production_cost_acre

# Import fake data
drybeans <- read_xlsx(
  "data_raw/coa_cost_per_acre.xlsx",
  sheet = "drybeans")

# Add production cost per acre for beans from the crop budget:
# scale the NY baseline by each group's percent difference
# (mean and SE are shifted by their respective percent differences)
drybeans <- drybeans %>%
  mutate(
    year = 2024,
    baseline = ny_production_cost_acre,
    baseline_se = ny_production_cost_acre_se,
    production_cost_acre_mean = baseline +
      (baseline*percent_diff_mean),
    production_cost_acre_se = baseline_se +
      (baseline_se*percent_diff_se)
  )
# Rename to be consistent with other spreadsheet
# Rename to be consistent with other spreadsheet.
# Maps each (MWBE, sales class, organic) combination onto the variable
# name used in the other data files. Combinations not listed here fall
# through to NA (case_when default) and are dropped in the next step.
drybeans <- drybeans %>%
  mutate(
    variable_name = case_when(
      # >= $1M GCFI, not MWBE, conventional
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI $1M or more" &
        organic_fct == "No organic sales" ~
        "cost_of_production_drybean_more_mill_nonmwbe_conv",
      # >= $1M GCFI, MWBE, conventional
      mwbe_fct=="MWBE" &
        sales_class_agg == "GCFI $1M or more" &
        organic_fct == "No organic sales" ~
        "cost_of_production_drybean_more_mill_mwbe_conv",
      # < $1M GCFI, not MWBE, conventional
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "No organic sales" ~
        "cost_of_production_drybean_less_mill_nonmwbe_conv",
      # < $1M GCFI, MWBE, conventional
      mwbe_fct=="MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "No organic sales" ~
        "cost_of_production_drybean_less_mill_mwbe_conv",
      # < $1M GCFI, not MWBE, organic
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "Organic sales" ~
        "cost_of_production_drybean_less_mill_nonmwbe_org")
  )
# Keep only data we need
# Keep only the data we need; drop rows whose group combination did not
# map to a variable name (extraction artifact: the section heading had
# been fused onto the last code line, breaking the parse — separated)
drybeans <- drybeans %>%
  select(variable_name,
         production_cost_acre_mean, production_cost_acre_se) %>%
  filter(!is.na(variable_name))

2 Data inputs
2.1 Agricultural production
We estimate cost of production for dry bean and soybean producers using a combination of data sources including the restricted access Census of Agriculture, crop budgets from select states, and interview data. Prices received by producer on the commodity market are based on price data from NASS and prices received on the differentiated market are based on interview data.
2.1.1 Cost of production
Using the restricted access data from the Census of Agriculture, we use the NASS defined variable, total expense, which includes all production expenses listed in the production expense section of the census and includes both fixed and variable expenses. Data are pulled for producers with dry bean sales. However, these data are at the whole farm level. We do not know the cost of production of beans specifically.
2.1.1.1 Dry beans
To understand cost of production for New York bean producers, we use data from interviews conducted of New York bean producers, which estimate a cost of production of $525 per acre.
We also need a cost of production for bean producers outside of New York, as those are the producers currently supplying the New York City beans. Based on discussions with Michael, most of the beans come from North Dakota, so we base our cost of production on an enterprise budget from North Dakota. We also include the other crop budgets we could find for comparison/to provide range:
- North Dakota, 2024
- Cost per acre: $434.05
- Wyoming
- Cost per acre: $598.70, cost per CWT $23.95
- Colorado
- Cost per acre: $454.57, cost per CWT $113.64
- Texas
- Cost per acre: $400
Conclusions: We will use the cost of production data from New York combined with Census of Agriculture data to estimate cost of production for different types of operations. For those outside of NY, we will provide a range based on the enterprise budgets we were able to find.
From the Census of Agriculture, we have an average dry bean yield in NY of 20.5 CWT per acre to convert to production cost per lb.
The New York cost of production per acre is based on June’s assumption.
2.1.1.1.1 NYS producers
2.1.1.1.2 Outside NYS producers
- North Dakota: 434.05
- Wyoming: 598.70
- Colorado: 454.57
- Texas: 400
# Calculate mean and se from the out-of-state crop budgets that we have
outside_ny <- tibble(
  location = c("North Dakota", "Wyoming", "Colorado", "Texas"),
  cost_acre = c(434.05, 598.70, 454.57, 400)
)

# Calculate mean and standard error as scalars.
# Named outside_ny_* rather than `mean`/`se` so base::mean is not
# shadowed in the global environment (the original bound `mean`).
outside_ny_mean <- mean(outside_ny$cost_acre)
outside_ny_se <- sd(outside_ny$cost_acre) / sqrt(nrow(outside_ny))

# Append the outside-NY row to the dry bean data
drybeans <- drybeans %>%
  add_row(
    variable_name = "cost_of_production_drybean_outsideNY",
    production_cost_acre_mean = outside_ny_mean,
    production_cost_acre_se = outside_ny_se)
# Add NA rows for group combinations not present in the data.
# (Named variable_names rather than `names` to avoid shadowing
# base::names.)
variable_names <- tibble(
  variable_name =
    c("cost_of_production_drybean_more_mill_nonmwbe_conv",
      "cost_of_production_drybean_more_mill_mwbe_conv",
      "cost_of_production_drybean_less_mill_nonmwbe_conv",
      "cost_of_production_drybean_less_mill_mwbe_conv",
      "cost_of_production_drybean_more_mill_nonmwbe_org",
      "cost_of_production_drybean_more_mill_mwbe_org",
      "cost_of_production_drybean_less_mill_nonmwbe_org",
      "cost_of_production_drybean_less_mill_mwbe_org",
      "cost_of_production_drybean_outsideNY")
)

# Join with the full name list as the base so missing combinations come
# through as NA rows (explicit `by` documents the key and silences the
# join message)
drybeans <- left_join(variable_names, drybeans, by = "variable_name")
rm(variable_names)

# Add $ per hectare (1 hectare = 2.471 acres)
drybeans <- drybeans %>%
  mutate(
    production_cost_hectare_mean = production_cost_acre_mean*2.471,
    production_cost_hectare_se = production_cost_acre_se*2.471
  )
rm(outside_ny)

2.1.1.2 Soybeans
We use Census of Agriculture data and keep only those producers with soybean sales. Because we only have soybean producers, then the total cost per acre is for soybeans only and we don’t need to use a crop budget.
In NY there are not enough farms in some categories. So we modify that NY data that we have up or down based on the national numbers.
Conventional soybean producers in NY have 50% higher production costs than those in the US overall. We take the numbers we have by scale, MWBE, organic and increase them by 50%.
Crop budgets for comparison:
- Ohio in 2022
- Conventional: 761/acre.
- There is no data for organic until 2024, which is 946/acre. For comparison, conventional soy in 2024 was 754/acre.
- Louisiana in 2022
- 454/acre
- Alabama in 2025
- 471/acre
# Import data
soybeans <- read_xlsx(
  "data_raw/coa_cost_per_acre.xlsx",
  sheet = "soybeans")

# Percent difference used to scale the national numbers up to NY levels
# (assumes exactly one sample == "NY" row in the sheet — TODO confirm)
ny_inflation <- soybeans %>%
  filter(sample=="NY") %>%
  pull(percent_diff_mean)

# Inflate the means/SEs by the NY inflation number, keeping only the
# group-level (non-"all") rows
soybeans <- soybeans %>%
  select(mwbe_fct:se) %>%
  filter(mwbe_fct != "all") %>%
  mutate(
    mean = mean + (mean*ny_inflation),
    se = se + (se*ny_inflation)
  )
# Rename to be consistent with the other spreadsheet.
# Maps each (MWBE, sales class, organic) combination onto the variable
# name used elsewhere; unlisted combinations fall through to NA.
soybeans <- soybeans %>%
  mutate(
    variable_name = case_when(
      # >= $1M GCFI, not MWBE, conventional
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI $1M or more" &
        organic_fct == "No organic sales" ~
        "cost_of_production_soy_more_mill_nonmwbe_conv",
      # >= $1M GCFI, MWBE, conventional
      mwbe_fct=="MWBE" &
        sales_class_agg == "GCFI $1M or more" &
        organic_fct == "No organic sales" ~
        "cost_of_production_soy_more_mill_mwbe_conv",
      # < $1M GCFI, not MWBE, conventional
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "No organic sales" ~
        "cost_of_production_soy_less_mill_nonmwbe_conv",
      # < $1M GCFI, MWBE, conventional
      mwbe_fct=="MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "No organic sales" ~
        "cost_of_production_soy_less_mill_mwbe_conv",
      # < $1M GCFI, not MWBE, organic
      mwbe_fct=="Not MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "Organic sales" ~
        "cost_of_production_soy_less_mill_nonmwbe_org",
      # < $1M GCFI, MWBE, organic
      mwbe_fct=="MWBE" &
        sales_class_agg == "GCFI less than $1M" &
        organic_fct == "Organic sales" ~
        "cost_of_production_soy_less_mill_mwbe_org"))
# Add all variable names so combinations with no data appear as blanks.
# (Named soy_variable_names rather than `names` to avoid shadowing
# base::names.)
soy_variable_names <- tibble(
  variable_name = c(
    "cost_of_production_soy_more_mill_nonmwbe_conv",
    "cost_of_production_soy_more_mill_mwbe_conv",
    "cost_of_production_soy_less_mill_nonmwbe_conv",
    "cost_of_production_soy_less_mill_mwbe_conv",
    "cost_of_production_soy_more_mill_nonmwbe_org",
    "cost_of_production_soy_more_mill_mwbe_org",
    "cost_of_production_soy_less_mill_nonmwbe_org",
    "cost_of_production_soy_less_mill_mwbe_org",
    "cost_of_production_soy_outsideNY")
)

# Join with the name list as the base (explicit `by` documents the key)
soybeans <- left_join(soy_variable_names, soybeans, by = "variable_name")
rm(soy_variable_names)

# Keep columns of interest
soybeans <- soybeans %>%
  select(variable_name, mean, se)

# Rename and add $ per hectare (1 hectare = 2.471 acres)
soybeans <- soybeans %>%
  rename(
    production_cost_acre_mean = mean,
    production_cost_acre_se = se) %>%
  mutate(
    production_cost_hectare_mean = production_cost_acre_mean*2.471,
    production_cost_hectare_se = production_cost_acre_se*2.471)

2.1.1.3 Save
Combine all data into one file and save. Final cost of production data
# Combine dry bean and soybean data
production_cost <- bind_rows(drybeans, soybeans)

# Write to file
wb <- createWorkbook()

# Info sheet describing the data (fixed an unbalanced ")" at the end of
# the description)
info <- tibble(
  sheet_name = "production_cost",
  description = "Mean and standard deviation for cost of production per acre and hectare for dry bean and soybean growers. Costs are by crop, location (in NY/outside NY), scale (under/over $M), organic, and MWBE.",
  notes = "Missing data means there were no farms of that type in the data.",
  source = "Restricted access census of agriculture and crop budgets. See Quarto file for more information."
)

# Add worksheets and save
addWorksheet(wb, "info")
writeData(wb, "info", info)
addWorksheet(wb, "production_cost")
writeData(wb, "production_cost", production_cost)
saveWorkbook(wb, "data_final/production_cost.xlsx",
             overwrite = TRUE)

# NOTE(review): clears the entire global environment between document
# sections. Fine in a linear render, but it breaks interactive re-runs
# and is generally discouraged in scripts.
rm(list=ls())

2.1.2 Prices
2.1.2.1 NASS prices
For prices for conventionally grown dry beans (dry edible beans, excluding chickpeas) and soybeans, we use price data from USDA NASS Quick Stats. All data in price per cwt for dry edible beans and bushels for soybeans. Soybean data is based on soybeans for beans. For the model, they want everything in terms of kilograms, so we convert to dollars per kg.
# Import quick stats data (NASS Quick Stats export)
price <- read_csv("data_raw/quick_stats_conventional_bean_prices.csv") %>%
  janitor::clean_names()

# Keep columns of interest
price <- price %>%
  select(year, commodity, value)

2.1.2.2 Convert to $/lb and $/kg
All data in price per cwt for dry edible beans and bushels for soybeans. Soybean data is based on soybeans for beans.
We convert everything to price per kilogram, as required by the model, and price per lb.
Dry beans 1 cwt = 100 pounds 1 pound = 0.45359 kg 1 cwt = 45.359 kg
Soybeans 1 bushel of soybeans = 60 pounds (this is the standard weight) 1 pound = 0.45359 kg 1 bushel of soybeans = 60 × 0.45359 = 27.216 kg
# Convert to price per kg and price per lb.
# NASS reports dry beans in $/cwt and soybeans in $/bushel.
price <- price %>%
  mutate(
    price_per_kg = case_when(
      commodity == "BEANS" ~ value/45.359,    # 1 cwt = 45.359 kg
      commodity == "SOYBEANS" ~ value/27.216  # 1 bushel = 27.216 kg
    ),
    price_per_lb = case_when(
      commodity == "BEANS" ~ value/100,       # 1 cwt = 100 lb
      commodity == "SOYBEANS" ~ value/60      # 1 bushel = 60 lb
    )
  )

# Rename and select columns of interest
price <- price %>%
  mutate(
    variable_name = case_when(
      commodity == "BEANS" ~ "commodity_price_drybean_conv",
      commodity == "SOYBEANS" ~ "commodity_price_soy_conv")
  ) %>%
  select(year, commodity, variable_name, starts_with("price"))

2.1.2.3 PPI
We present data in 2023 prices using the St. Louis Fed Producer Price Index by Commodity for soybeans and dry beans.
The price index for dry beans starts in 2015, so we use the 2015 value for 2014.
# Import PPI series (FRED downloads; one observation per date)
ppi_drybeans <- read_csv("data_raw/WPU0113011_drybeans.csv") %>%
  rename(
    drybeans = WPU0113011
  )
ppi_soybeans <- read_csv("data_raw/WPU01830131_soybeans.csv") %>%
  rename(
    soybeans = WPU01830131
  )

# Join on the shared date column (explicit `by` documents the key and
# silences the join-key message)
ppi <- left_join(ppi_soybeans, ppi_drybeans, by = "observation_date")
rm(ppi_drybeans, ppi_soybeans)
# Derive the calendar year, keep 2014-2024, and collapse to annual
# means. Dry beans need na.rm = TRUE because that series has no values
# before 2015; the soybean series is left to propagate NAs so a gap
# would surface as a problem.
ppi <- ppi %>%
  mutate(year = year(observation_date)) %>%
  filter(between(year, 2014, 2024)) %>%
  group_by(year) %>%
  summarise(drybeans = mean(drybeans, na.rm = TRUE),
            soybeans = mean(soybeans)) %>%
  ungroup()

# Index each series to its 2023 level (2023 value / year value), so
# multiplying a nominal price by the index converts it to 2023 dollars
ppi <- ppi %>%
  mutate(
    drybeans_index = drybeans[year == 2023] / drybeans,
    soybeans_index = soybeans[year == 2023] / soybeans
  )
# The dry bean PPI starts in 2015, so the 2014 mean is NaN (mean of an
# all-NA year with na.rm = TRUE); backfill it with the 2015 index value
ppi <- ppi %>%
  mutate(
    drybeans_index = if_else(
      is.nan(drybeans_index),
      drybeans_index[year==2015],
      drybeans_index)
  )
# Reshape to long format so the index joins onto the price data by
# year and commodity
ppi <- ppi %>%
  select(year, drybeans_index, soybeans_index) %>%
  pivot_longer(
    cols = c(drybeans_index, soybeans_index),
    names_to = "commodity",
    values_to = "ppi_index"
  ) %>%
  mutate(
    # Recode to the commodity labels used in the NASS price data
    commodity = if_else(commodity == "drybeans_index",
                        "BEANS", "SOYBEANS")
  )
# Join price and PPI data (explicit `by` documents the key and silences
# the join-key message)
price <- price %>%
  left_join(ppi, by = c("year", "commodity"))

# Convert nominal prices to 2023 dollars using the PPI index
price <- price %>%
  mutate(
    real_price_kg_2023dollars = price_per_kg * ppi_index
  ) %>%
  rename(
    nominal_price_kg = price_per_kg,
    nominal_price_lb = price_per_lb) %>%
  select(-ppi_index)
# Round the price columns to 2 decimals (the trailing ungroup() in the
# original was a no-op — `price` is never grouped — and was removed)
price <- price %>%
  mutate(across(nominal_price_kg:last_col(),
                ~round(., 2)))

# Drop commodity; variable_name now identifies the series
price <- price %>%
  select(-commodity)

# Keep a copy with all years before the 2023 filter below
price_full <- price

2.1.2.4 Commodity price
We use the data collected above using 2023 dollars and convert the data to be in a similar format to the cost of production data. In this case, we will have different prices for organic, conventional, and New York beans sold through differentiated channels (not data on soy for this category).
# Keep 2023 data only (the model uses 2023 dollars/quantities)
price <- price %>%
  filter(year == 2023)

2.1.3 Organic
We use data from the Risk Management Agency (RMA) on the approved 2023 crop year (CY) projected prices shown below for the following plans of insurance: Yield Protection, Revenue Protection, and Revenue Protection with Harvest Price Exclusion.
The projected price per pound for drybeans is double for organic compared to conventional. We double all conventional prices.
For soybeans, we use price discovery data from the RMA.
# Import the RMA soybean projected-price data
soy_organic <- read_xlsx("data_raw/RMA_soybeans.xlsx",
                         sheet = "Current Periods") %>%
  janitor::clean_names()

# Keep the columns needed for the organic/conventional comparison
soy_organic <- soy_organic %>%
  select(commodity_year, practice_name, state_name, projected_price)
# Compare prices for organic and conventional for the same state/year.
# After summarise() the result stays grouped by commodity_year (dplyr
# drops only the last grouping level), so the Organic/Conventional
# ratio is computed within each commodity year; slice(1) then keeps one
# row per year and pull() extracts the ratio.
# NOTE(review): if the "Current Periods" sheet holds more than one
# commodity_year, `soy_organic` becomes a vector and the later
# `~. * soy_organic` multiplication would recycle it — confirm the
# sheet contains a single year.
soy_organic <- soy_organic %>%
  group_by(commodity_year, practice_name) %>%
  summarise(
    price = mean(projected_price)
  ) %>%
  mutate(
    organic_conventional =
      price[practice_name=="Organic"] /
      price[practice_name=="Conventional"]
  ) %>%
  slice(1) %>%
  pull(organic_conventional)
# Dry beans: take the conventional commodity price and add an organic
# row at double the price (per the RMA projected prices noted above)
drybean_price <- price %>%
  filter(variable_name=="commodity_price_drybean_conv")
drybean_price <- drybean_price %>%
  bind_rows(
    drybean_price %>%
      mutate(
        across(nominal_price_kg:last_col(),
               ~. * 2),
        variable_name = "commodity_price_drybean_organic")
  )

# Soybeans: scale the conventional price by the RMA
# organic/conventional ratio
soybean_price <- price %>%
  filter(variable_name=="commodity_price_soy_conv")
soybean_price <- soybean_price %>%
  bind_rows(
    soybean_price %>%
      mutate(
        across(nominal_price_kg:last_col(),
               ~. * soy_organic),
        # NOTE(review): "soybean" here vs "soy" in the conventional
        # name ("commodity_price_soy_conv") — confirm downstream
        # consumers expect this inconsistency before renaming
        variable_name = "commodity_price_soybean_organic")
  )

# Bind rows
price <- bind_rows(drybean_price, soybean_price)
rm(soy_organic, drybean_price, soybean_price)

2.1.3.1 Differentiated price
We use data from the interviews to understand differentiated prices
NY Bean: Breakdown on a 50lb bag: Grower receives $52; Cleaner $5; $57 price out the door
# Dry bean PPI index for converting 2024 dollars to 2023 dollars
ppi_vector <- ppi %>%
  filter(year==2024 & commodity=="BEANS") %>%
  pull(ppi_index)

# Add differentiated-price data: grower receives $52 on a 50 lb bag
diff_price <- tibble(
  year = 2023,
  variable_name = "differentiated_price_drybean_conventional",
  nominal_price_lb = 52/50,
  # $/lb -> $/kg requires DIVIDING by kg per lb (a kg costs more than
  # a lb). The original multiplied by 0.45359237, understating the
  # per-kg price by a factor of ~4.86 and contradicting the $/cwt
  # conversion above (value/45.359).
  nominal_price_kg = nominal_price_lb / 0.45359237,
  real_price_kg_2023dollars = nominal_price_kg * ppi_vector
)

# Add organic by multiplying by 2
diff_price <- diff_price %>%
  bind_rows(
    diff_price %>%
      mutate(
        across(nominal_price_lb:last_col(),
               ~. * 2),
        variable_name = "differentiated_price_drybean_organic")
  )

# Bind rows
price <- bind_rows(price, diff_price)

2.1.4 Save
Combine all data into one file and save. Final cost of production data
# Build the workbook and add the price data
wb <- createWorkbook()
addWorksheet(wb, "farmgate_price")
writeData(wb, "farmgate_price", price)

# Info sheet describing the data (typos in the emitted notes string
# fixed: "receved" -> "received", missing "is")
info <- tibble(
  sheet_name = c("farmgate_price"),
  description = "Price received by the producer in $/lb. and $/kg. by year for drybeans and soybeans, conventional and organic, commodity and differentiated (i.e., grown in NY and through a source verified channel). All prices are national, except for the differentiated",
  notes = "Nominal prices are reported as the price that was received in that year. real_price_kg is the price per kg., adjusted for inflation, reported in 2023 dollars. real_price_kg is the number that is used in the model. We use 2023 dollars because that is the year of data we are using for school purchases",
  source = "USDA NASS Quick stats for conventional prices, RMA data to inflate numbers for organic prices, interview data for differentiated price.")

# Add the info worksheet
addWorksheet(wb, "info")
writeData(wb, "info", info)

# Save
saveWorkbook(wb, "data_final/farmgate_price_received.xlsx",
             overwrite = TRUE)

2.2 Processing and distribution
2.2.1 Description of Data Sources
All of the data from this file will be used to inform the supply chain of the ABM model.
Wholesalers and Distributors We define the list of wholesale/distributor businesses from the SimplyAnalytics database of businesses located in New York State (NYS) that could carry beans. We assume that any wholesaler/distributor can carry MWBE identified product or local identified product. We also assume that wholesalers in NYS can supply all product demanded by New York City (NYC).
The only businesses that are explicitly included in the model are those that are identified as MWBE businesses or those that sold to NYC previously.
Aggregators/cleaners and processors We define the list of aggregator/cleaners and processors based on businesses that…
- were interviewed and/or identified by research team as key businesses in the bean supply chain (DryBean_SupplyChain_Data)
- sold to NYC previously and are located in NYS or located outside of NYS but sells NYS source-identified or NYS MW/BE product
- sells NYS source-identified product is defined in the NYC purchasing data by the variable “NY state spend” which indicates if an item is grown, processed, manufactured, or distributed by businesses located within New York State. The data is such that we cannot identify which business along the supply chain was a NYS product as the origin detail (processor), distributor, and vendor are all listed in one row where the “NY state spend” variable indicates yes/no all along the supply chain. We include all businesses along the supply chain that indicated at least one was a NYS product as NYS source identified.
- sold to NYC previously and are registered as a Minority/Women Owned Business (MWBE) through NYS and/or NYC
- were part of the list of the New York Food Products Database
- we include only those processors that produced legumes or plant-based protein
The following information is needed for all businesses:
- Location (in NY or not)
- Address
- Role in the supply chain
- Has the business sold to New York City
- If the business sold to NYC, was the product from NY
- MWBE or not
- Works in the local and regional food supply chain
Location All data sets have information on if the business is located in NYS or not
Address We include an address for each business and then convert this to latitude and longitude coordinates to be included in the map. This data is not available for the majority of businesses so addresses are gathered and input manually.
Role in the supply chain For those businesses that are wholesalers/distributors, we have that information in all data sets. For processors, we do not have detailed enough information available in the SimplyAnalytics data and those are gathered by hand through web searches. For example with beans, we know which businesses process beans but we do not know if they are a primary or secondary processor. We do have that level of detail available in the data gathered through our interviews.
Has the business sold to NYC Any business that is in the NYC data we indicate has sold to NYC, we assume all other businesses have not.
MWBE These data are listed in the NYC data and the interview data. For the SimplyAnalytics data, in order to determine if businesses are MWBE, we use a New York State database and a NYC database.
Works with local farms Businesses that sell locally are assumed to be those that were interviewed as part of this project and those found in the Vendor Contact List in the NY Food Product Database.
Max inventory This represents the maximum inventory that can flow through the businesses. If there are no supply constraints, then we use the number of pounds of beans that NYC purchased as the max inventory, with the goal that this number makes it so there is no constraint on how much the business can purchase in the model.
Organic We assume that all wholesalers/distributors can carry organic producers. Cleaners/aggregators and processors were identified by a web search if they can carry certified organic products.
Note: For confidentiality reasons, we have removed business names, phone numbers, and addresses from the publicly available data. This code will not run as the source data is not available publicly, but the code can be used to understand our process.
2.2.2 SimplyAnalytics
TODO: remove last 4 digits from the zipcode This data is used for the wholesale businesses.
Simply Analytics is a web-based mapping application that allows users to map businesses by name or by industry. Data are available to Cornell researchers through the Management Library. We collected data from (ADD YEAR) on the NAICS codes that are relevant to processing, distribution and wholesaling of dry beans, beef and leafy greens.
We use data aggregated at the 6-digit NAICS code.
The NAICS codes of interest for us include:
- 424410 General Line Grocery Merchant Wholesalers
- 424420 Packaged Frozen Food Merchant Wholesalers
- 424490 Other Grocery and Related Products Merchant Wholesalers
- 424510 Grain and Field Bean Merchant Wholesalers
- 484110 General Freight Trucking, Local
- 484121 General Freight Trucking, Long-Distance, Truckload
- 484122 General Freight Trucking, Long-Distance, Less Than Truckload
- 493110 General Warehousing and Storage
- 493120 Refrigerated Warehousing and Storage
- 493130 Farm Product Warehousing and Storage
- 493190 Other Warehousing and Storage
2.2.2.1 Import and clean data
# Packages for this section
library(tidyverse)
library(openxlsx)
library(janitor)
library(readxl)
library(fuzzyjoin)

# Workbook that will collect the processed sheets
wb <- createWorkbook()

# Source workbook and the sheets of interest within it
file_path <- "../data_raw/ToddNAICS4.xlsx"
sheet_names <- c("Wholesaling (42)", "Transportation (48)",
                 "Warehousing & Storage (49)")
# Import and stack all sheets of interest into one data frame, tagging
# each row with its source sheet (.id = "sheet_name").
# NOTE(review): suppressWarnings() hides read_excel coercion warnings;
# consider capturing them so silent type problems remain visible.
naics_df <- sheet_names %>%
  set_names() %>%
  map_dfr(~suppressWarnings(read_excel(file_path,
                                       sheet = .x)) %>%
            # Keep identifying columns only; zip codes as character so
            # leading zeros are not lost
            select(NAICS, Description, `Company Name`, `Street Address`,
                   City, State, `Zip Code`) %>%
            mutate(
              `Zip Code` = as.character(`Zip Code`)),
          .id = "sheet_name") %>%
  clean_names()
# NAICS codes of interest (wholesale, trucking, warehousing).
# The original vector repeated "424420"; duplicates are harmless with
# %in% but the list is deduplicated here for clarity.
wholesale_dist <- c("424410", "424420", "424490",
                    "424510", "484110", "484121", "484122",
                    "493110", "493120", "493130", "493190")
naics_df <- naics_df %>%
  filter(naics %in% wholesale_dist)

# Only keep businesses located in NY
naics_df <- naics_df %>%
  filter(state=="NY")

2.2.2.2 Define variables
Supply chain role For the processing sectors of the supply chain, we specify the commodity, and for the wholesale/distribution sectors we assume they sell all three commodities.
Located in NY or not We create a dummy variable with a 1 if in NY and 0 otherwise. All of these businesses are located in NYS.
# Supply chain role: every business in this data is a
# wholesaler/distributor
naics_df <- naics_df %>%
  mutate(
    wholesaler_distributor = 1)

# Define NY dummy (used later; all businesses here are in NY, so this
# is 1 for every row)
naics_df <- naics_df %>%
  mutate(
    ny = case_when(
      state=="NY" ~ 1,
      TRUE ~ 0))

# Keep columns of interest
naics_df <- naics_df %>%
  select(-c(naics, description, sheet_name))

# Flag the data source
naics_df <- naics_df %>%
  mutate(
    simply_analytics = 1)

2.2.3 New York City Purchasing
Overall, we use the NYC purchasing data to get a list of aggregator/cleaner and processor businesses that are located in NYS and have sold to NYC or are located outside of NYS but sell NYS source identified products to NYC.
We also include a list of wholesalers that we will match with the Simply Analytics database to add additional information to the existing Simply Analytics data.
Here we want the data from Citywide, which is what is downloaded when you download the full purchasing data from the NYC Food Policy Dashboard.
We keep all data relevant for the year 2023, based on primary food product category equal to legumes or meals that are meatless. In these data we have company names but we do not have information on their location.
The column called ny_state_spend_y_n indicates if an item is grown, processed, manufactured, or distributed by businesses located within New York State. The challenge is that the data includes up to three companies per line (origin_detail, distributor, and vendor) and for all three of these only one value for ny_state_spend_y_n. Therefore at least one, but not necessarily all of these businesses are in NYS. We want to identify businesses that sold NYS identified products to NYC, but using these data, we can’t know which of the three businesses in each row was the one identified. As a first step, we assume all businesses in the row (i.e., origin_detail, distributor, and vendor) are assumed to supply NYS product.
We assume all businesses listed under origin_detail are processors and get the beans based on the products they sold and all other businesses are wholesaler/distributors.
We want to include all businesses from this data set that are located in NYS and all out-of-state businesses that purchased NY identified products and sold to NYC (i.e., ny_state_spend_y_n==“y”), indicated by the variable “nys_product_purchase” equal to 1 if purchased source identified NY product and 0 otherwise.
While we do have a column in this dataset that indicates MWBE, we do not know from this data which business along the supply chain is MWBE. But since we have a list of all certified MWBE businesses for NYC and NYS, we rely on that list to confirm if a business is MWBE or not (rather than using the NYC purchasing database). Note that there are no bean processors from the NYC data that were matched with the NYC/NYS MWBE certified data.
We want to match data for all businesses listed. For each row there is a column called origin detail, distributor and vendor. We create a new column called company_name that includes all of the businesses listed in each column. We do not retain the columns pertaining to how much was purchased by NYC.
There are also a few businesses that sell meals with beans in them, specifically patties that we are including.
2.2.3.1 Import and clean
# Import data
nyc_df <- read_xlsx("../data_raw/Public-Dashboard-Data-for-Download-FY19-23.xlsx",
                    sheet = "Citywide") %>%
  clean_names()

# Keep 2023 rows that are Legumes, or Meals flagged MEATLESS.
# BUG FIX: the original condition was
#   time_period == 2023 & Legumes | (Meals & MEATLESS)
# and because & binds tighter than |, meatless Meals rows from ALL
# years were kept. Parenthesized so the 2023 restriction applies to
# both categories, matching the description in the text above.
nyc_df <- nyc_df %>%
  filter(time_period == 2023 &
           (primary_food_product_category == "Legumes" |
              (primary_food_product_category == "Meals" &
                 str_detect(product_type, "MEATLESS")))
  )
# Max inventory for each business: the total pounds NYC purchased
nyc_bean_max <- sum(nyc_df$total_weight_in_lbs)

# Keep variables of interest
nyc_df <- nyc_df %>%
  select(
    origin_detail, distributor, vendor,
    primary_food_product_category, ny_state_spend_y_n
  )

# Flag NYS-product purchases (case_when keeps NAs mapped to 0) and
# record that every business in this data sold to NYC
nyc_df <- nyc_df %>%
  mutate(nys_product_purchased = case_when(
    ny_state_spend_y_n=="Y" ~ 1,
    TRUE ~ 0),
    sold_to_NYC = 1) %>%
  select(!c(ny_state_spend_y_n))

# Capitalize all business names so they can match the Simply Analytics
# data
nyc_df <- nyc_df %>%
  mutate(across(c(origin_detail, vendor, distributor), str_to_upper))
# Stack origin_detail/distributor/vendor into a single company_name
# column, one row per business mention
nyc_df <- nyc_df %>%
  pivot_longer(
    cols = c(origin_detail, distributor, vendor),
    names_to = "name",
    values_to = "company_name")

# Supply chain role: origin_detail rows are bean processors; vendor and
# distributor rows are wholesaler/distributors
nyc_df <- nyc_df %>%
  mutate(
    beans_processor = as.numeric(
      name == "origin_detail" &
        primary_food_product_category %in% c("Legumes", "Meals")),
    wholesaler_distributor = as.numeric(
      name %in% c("vendor", "distributor")))
# Some company names include location info; drop everything from the
# first comma on, then drop empty names
nyc_df <- nyc_df %>%
  mutate(company_name = str_remove(company_name, ",.*")) %>%
  filter(company_name!="")

# Drop trailing spaces
nyc_df <- nyc_df %>%
  mutate(
    company_name = str_trim(company_name, side = "right")
  )

# Some entries look like two companies (e.g., RIVER / METRO
# COMMODITIES); these are kept for now and handled during the manual
# data cleaning step.

# Keep distinct observations only and drop blanks
nyc_df <- nyc_df %>%
  distinct(company_name, .keep_all = TRUE) %>%
  filter(company_name != "(BLANK)")

# One company is listed under several slightly different names; drop
# these three spellings. NOTE(review): this removes ALL three listed
# variants — the retained spelling must be a fourth variant not listed
# here; confirm one exists in the data.
nyc_df <- nyc_df %>%
  filter(!company_name %in% c("FURMANO", "FURMANO'S", "FURMANOS"))

# Drop columns we don't need
nyc_df <- nyc_df %>%
  select(company_name, nys_product_purchased, sold_to_NYC,
         beans_processor:last_col())

# Flag the data source
nyc_df <- nyc_df %>%
  mutate(
    nyc_dashboard = 1)

2.2.3.2 Join NYC with SimplyAnalytics
Here we take the NYC wholesale data information and join it with the Simply Analytics data.
2.2.3.2.1 NYC data is the base
We add addresses from the SA data to wholesalers that sold to NYC. And then we manually add in any missing addresses. This is one set of wholesale businesses that will be included in the model.
We determine the max inventory to be a number where the quantity is unlimited. As a base, we add up the total pounds of products that NYC purchased.
# Keep NYC wholesale businesses only
nyc_wholesale_df <- nyc_df %>%
  filter(wholesaler_distributor==1)

# Fuzzy-join (Jaro-Winkler distance <= 0.15) against the Simply
# Analytics data to pick up addresses
nyc_wholesale_df <- stringdist_left_join(
  nyc_wholesale_df, naics_df, by = "company_name",
  max_dist = 0.15, method = "jw")

# Keep the NYC-side columns: drop the .y duplicates, then strip the .x
# suffix from the survivors
nyc_wholesale_df <- nyc_wholesale_df %>%
  select(-ends_with(".y")) %>%
  rename_with(~str_remove(.x, "\\.x$"), ends_with(".x"))
# List businesses with no address match from the Simply Analytics data
missing <- nyc_wholesale_df %>%
  filter(is.na(street_address))
# Manually reviewed `missing`: all but one are located outside NYS and
# none sold NYS source-identified products. Fill in the address for the
# one relevant business by hand.
nyc_wholesale_df <- nyc_wholesale_df %>%
  mutate(
    street_address = case_when(
      company_name == "TERI NICHOLS" ~ "10101c Avenue D",
      TRUE ~ street_address),
    city = case_when(
      company_name == "TERI NICHOLS" ~ "Brooklyn",
      TRUE ~ city),
    state = case_when(
      company_name == "TERI NICHOLS" ~ "NY",
      TRUE ~ state),
    zip_code = case_when(
      company_name == "TERI NICHOLS" ~ "11236",
      TRUE ~ zip_code)
  )
# Drop businesses with missing address information (i.e., out of NYS);
# all remaining businesses are in NYS
nyc_wholesale_df <- nyc_wholesale_df %>%
  filter(!is.na(street_address))

# Keep columns of interest
nyc_wholesale_df <- nyc_wholesale_df %>%
  select(company_name, street_address,
         city, state, zip_code,
         wholesaler_distributor,
         nys_product_purchased, sold_to_NYC)

# Max inventory: total pounds NYC purchased, intended to be a
# non-binding cap in the model
nyc_wholesale_df <- nyc_wholesale_df %>%
  mutate(
    max_inv_lbs_year = nyc_bean_max
  )

# NY indicator (every remaining business is in NYS)
nyc_wholesale_df <- nyc_wholesale_df %>%
  mutate(
    ny = 1)

# Save - this will be the list of wholesalers in the model
save(nyc_wholesale_df, file = "../data_processed/NYC_wholesalers.RData")

2.2.3.2.2 Simply Analytics is the base
## To join with other data
# Keep NYC wholesale data only
wholesale_df <- nyc_df %>%
  filter(wholesaler_distributor==1)

# Fuzzy-join the NYC data onto the Simply Analytics base so all SA
# rows are kept; duplicates are removed below. (There are no matches
# with the interview data.)
joined_df <- stringdist_left_join(
  naics_df, wholesale_df, by = "company_name",
  max_dist = 0.15, method = "jw")

# Visually inspected: matched names refer to the same businesses.
# naics_df has address information for most businesses, so keep the
# company name as listed in the Simply Analytics data, drop the
# NYC-side duplicates, and strip the .x suffix.
joined_df <- joined_df %>%
  rename(company_name = company_name.x) %>%
  select(-ends_with(".y")) %>%
  rename_with(~str_remove(.x, "\\.x$"), ends_with(".x"))

# Replace the working copy
naics_df <- joined_df
rm(joined_df)

2.2.4 Key supply chain businesses
We import data on the businesses that were interviewed and/or identified by the research team as key businesses in the bean supply chain. All businesses are designated as local and regional food system businesses, indicating they could distribute source identified product (lrfs_business==1).
These data include some businesses that were not interviewed. We only include the rows Wild Kale through Vermont Bean Crafters (i.e., those with data included in the additional columns).
There are a few files with the same data, we use the file DryBean_SupplyChain_data.xlsx as it has the data that we need in the easiest format to use.
While there are important wholesalers and distributors listed here that can distribute NYS beans, because we assume that this is possible for all distributors, these are not explicitly modeled in our system. We mostly utilize this data for aggregator/cleaners and processors.
2.2.4.1 Import and clean data
# Import the supply-chain data collected by the qualitative team
qual_df <- read_xlsx("../data_raw/DryBean_SupplyChain_Data.xlsx",
                     sheet = "Sheet1") %>%
  clean_names()

# Interviewed businesses are the rows with the extra interview columns
# filled in; keep only those
qual_df <- qual_df %>%
  filter(!is.na(agents_tolerence_for_risk))

# Harmonize with the other data sets: upper-case the business name and
# build 0/1 indicators for location and supply-chain segment (rows where
# the pattern match is NA count as 0, as in the original case_when logic)
qual_df <- qual_df %>%
  mutate(
    company_name = str_to_upper(business_name),
    ny = coalesce(as.numeric(in_state_out_of_state == "In-state"), 0),
    aggregator_cleaner = coalesce(
      as.numeric(str_detect(supply_chain_segment, "Aggregator|Cleaner|cleaner")), 0),
    primary_processor = coalesce(
      as.numeric(str_detect(supply_chain_segment, "Primary Processor")), 0),
    secondary_processor = coalesce(
      as.numeric(str_detect(supply_chain_segment, "Secondary Processor")), 0),
    wholesaler_distributor = coalesce(
      as.numeric(str_detect(supply_chain_segment, "Wholesaler|Distributor|copacker")), 0))

# Keep the harmonized columns; every business here is a local/regional
# food system business (can distribute source-identified product)
qual_df <- qual_df %>%
  select(company_name, ny, aggregator_cleaner:wholesaler_distributor,
         mwbe) %>%
  mutate(lrfs_business = 1)
# Flag that these rows come from the interview data
qual_df <- qual_df %>%
mutate(
interview_data = 1)2.2.4.2 Join to NYC/SA data
# Check whether any of the interviewed businesses currently sell to NYC
joined_df <- stringdist_inner_join(qual_df, nyc_df,
                                   by = "company_name",
                                   max_dist = 0.15,
                                   method = "jw")

# Only one company matched, and it appears under a different name in the
# NYC data, so it is easier to flag it manually than to merge the join back
qual_df <- qual_df %>%
  mutate(
    sold_to_NYC = coalesce(
      as.numeric(company_name == "FURMANO FOODS"), 0)
  )
rm(joined_df)

## Fold the interview data into the SimplyAnalytics data
joined_df <- stringdist_left_join(naics_df, qual_df,
                                  by = "company_name",
                                  max_dist = 0.15,
                                  method = "jw") %>%
  # Keep the SimplyAnalytics version of the company name and any
  # duplicated columns (the ".x" side)
  rename(company_name = company_name.x) %>%
  select(-ends_with(".y")) %>%
  rename_with(~ str_remove(.x, "\\.x$"), ends_with(".x"))

naics_df <- joined_df
rm(joined_df)2.2.5 NY Food Product Database
We use data from the NY Food Product Database to identify businesses that sell New York branded products (lrfs==1).
Bean processors include any manufacturer that lists the following values for Sub Category: “Legumes,Plant-Based Protein”. There are no bean processors in this database. However, we add one manually: Community Food Works (Milestone Mill) is a processor and distributor that was added after our data was available. We do not rely on the MWBE certification from this database as it is unclear whether it is for the processor or wholesaler. Rather, we use the list of businesses certified MWBE by NYS and NYC.
2.2.5.1 Import and clean data
# Import the NY Food Product Database export
ny_foods_df <- read_csv("../data_raw/NY Food Products - Approved-Grid View.csv") %>%
clean_names()
# Select data of interest
ny_foods_df <- ny_foods_df %>%
select(sub_category, manufacturer, distributors)
# Multiple distributors are listed in each row separated by a comma; split
# them into separate rows. The negative lookahead (?!\s*Inc) avoids
# splitting on the comma inside names like "Acme, Inc".
ny_foods_df <- ny_foods_df %>%
separate_rows(distributors, sep = ",\\s*(?!\\s*Inc)") %>%
mutate(distributors = str_trim(distributors),
distributors = str_remove_all(distributors, '["\']'))
# Define businesses that process beans (exact Sub Category string match)
ny_foods_df <- ny_foods_df %>%
mutate(
beans_processor = case_when(
sub_category %in% c("Legumes,Plant-Based Protein") ~ 1,
TRUE ~ 0)) %>%
select(!sub_category)
# Add in Community Food Works (Milestone Mill), a processor/distributor
# that was added after our data was pulled
ny_foods_df <- ny_foods_df %>%
add_row(
manufacturer = "Community Food Works (Milestone Mill)",
distributors = "Community Food Works (Milestone Mill)",
beans_processor = 1
)
# Distinct list of wholesalers/distributors
distributors <- ny_foods_df %>%
select(distributors) %>%
mutate(wholesaler_distributor = 1) %>%
distinct(.keep_all = TRUE) %>%
rename(company_name = distributors)
# Distinct list of bean processors
processors <- ny_foods_df %>%
filter(beans_processor==1) %>%
select(manufacturer, beans_processor) %>%
distinct(.keep_all = TRUE) %>%
rename(company_name = manufacturer)
# Combine; a business can be both, so fill the missing indicator with 0
ny_foods_df <- full_join(distributors, processors) %>%
mutate(across(c(beans_processor, wholesaler_distributor),
~case_when(is.na(.) ~ 0,
TRUE ~ .))
)
rm(distributors, processors)
# Upper-case names to match the other data sets; all of these businesses
# sell NY-branded (source-identified) product
ny_foods_df <- ny_foods_df %>%
mutate(company_name = toupper(company_name),
lrfs_business = 1)
# Drop rows with no company name
ny_foods_df <- ny_foods_df %>%
filter(!is.na(company_name))
# Flag that these rows come from the NY Food Product Database
ny_foods_df <- ny_foods_df %>%
mutate(
food_product_database = 1
)2.2.5.2 Join to SimplyAnalytics
Here we take the wholesaler information from the NY Food Product Database and join it with the Simply Analytics data.
# Restrict the NY food product data to wholesalers/distributors
wholesale_df <- ny_foods_df %>%
  filter(wholesaler_distributor == 1)

# Fuzzy-join to the SimplyAnalytics businesses, keeping all
# SimplyAnalytics rows
joined_df <- stringdist_left_join(
  naics_df, wholesale_df,
  by = "company_name", max_dist = 0.15, method = "jw"
) %>%
  # Matched names refer to the same businesses; keep the SimplyAnalytics
  # name and columns, EXCEPT lrfs_business, which comes from the NY food
  # product database (".y" side) — rename it before the ".y" columns drop
  rename(company_name = company_name.x) %>%
  select(-lrfs_business.x) %>%
  rename(lrfs_business = lrfs_business.y) %>%
  select(-ends_with(".y")) %>%
  rename_with(~ str_remove(.x, "\\.x$"), ends_with(".x")) %>%
  # Processor/aggregator indicators are not needed on the wholesale side
  select(!c(beans_processor, primary_processor,
            secondary_processor, aggregator_cleaner))

naics_df <- joined_df
rm(joined_df)2.2.6 MWBE
We join the NYS and NYC MWBE certified businesses with all of our data sets.
New York State provides a database of all New York State contractors that are certified MWBE, as does New York City. We download the entire directory from both.
We select the following commodity codes:
- 019 - AGRICULTURAL CROPS AND GRAINS INCLUDING FRUITS, MELONS, NUTS
- 375 - FOODS: BAKERY PRODUCTS (FRESH)
- 380 - FOODS: DAIRY PRODUCTS (FRESH)
- 385 - FOODS, FROZEN
- 390 - FOODS: PERISHABLE
- 393 - FOODS: STAPLE GROCERY AND GROCER’S MISCELLANEOUS ITEMS
2.2.6.1 Import data
# NYS MWBE directory: keep the (upper-cased) company name only
mwbe_ny <- read_csv("../data_raw/Directory_2025-05-30_862.csv",
                    skip = 5) %>%
  clean_names() %>%
  transmute(company_name = str_to_upper(company_name))

# NYC MWBE directory: same idea, but the name column differs
mwbe_nyc <- read_xlsx("../data_raw/OnlineDirectory06042025.xlsx",
                      skip = 6) %>%
  clean_names() %>%
  transmute(company_name = str_to_upper(vendor_formal_name))

# Combine the two directories; convert names to UTF-8 so the fuzzy joins
# below do not fail on mixed encodings
mwbe <- full_join(mwbe_ny, mwbe_nyc) %>%
  mutate(
    mwbe = 1,
    company_name = iconv(company_name, from = "", to = "UTF-8")
  )
rm(mwbe_ny, mwbe_nyc)2.2.6.2 Join to SimplyAnalytics
# Fuzzy-join the SimplyAnalytics businesses to the MWBE directory to see
# which overlap (tight threshold to avoid false matches)
joined_df <- stringdist_inner_join(naics_df, mwbe,
                                   by = "company_name",
                                   max_dist = 0.03,
                                   method = "jw")
joined_df <- joined_df %>%
  select(company_name.x, company_name.y, everything())
joined_df <- joined_df %>%
  rename(company_name = company_name.x) %>%
  distinct(company_name)

# Use the matched names to flag MWBE businesses in the main data
# (%in% never returns NA, so the flag is always 0/1)
mwbe_list <- joined_df %>%
  pull(company_name)
naics_df <- naics_df %>%
  mutate(mwbe = as.numeric(company_name %in% mwbe_list))

# Restrict to MWBE businesses, alphabetize, and drop exact duplicates
# (rows that differ only by address are kept)
naics_df <- naics_df %>%
  filter(mwbe == 1) %>%
  arrange(company_name) %>%
  distinct(.keep_all = TRUE)

# NA indicators mean "no": replace with 0; trim zip codes to 5 digits;
# keep only the columns of interest
naics_df <- naics_df %>%
  mutate(
    across(nys_product_purchased:last_col(), ~ replace_na(., 0)),
    zip_code = str_sub(zip_code, 1, 5)
  ) %>%
  select(company_name:ny,
         nys_product_purchased, sold_to_NYC,
         mwbe, lrfs_business)
# Save - this will be wholesalers in the database
save(naics_df, file = "../data_processed/MWBE_wholesalers.RData")2.2.6.3 Join to NYC processors
Here we match the data to the processor data only and find no matches.
All NYC purchasing data from a wholesaler will be in the SimplyAnalytics data.
# Check the MWBE directory against the processor subset of the NY food
# product data — visually confirmed there are no matches
joined_df <- stringdist_inner_join(
ny_foods_df %>%
filter(beans_processor==1),
mwbe, by = "company_name",
max_dist = 0.2, method = "jw")2.2.6.4 Join to NY Food Product Dashboard data
Here we match the data to the processor data only and find no matches.
All wholesalers will be in the SimplyAnalytics data.
# NOTE(review): this repeats the join in the previous section verbatim;
# again, there are no matches
joined_df <- stringdist_inner_join(
ny_foods_df %>%
filter(beans_processor==1),
mwbe, by = "company_name",
max_dist = 0.2, method = "jw")2.2.7 Interim Final data
Here we keep wholesaler data from SimplyAnalytics (with any additional information from other data sets included) and processor data from all other data sources.
Here we add in the columns for the address data that will be added manually. At this step, the data is saved as an excel file and then addresses and other missing information are manually added by the qualitative team. After these additions, the data is imported, geocoded and then exported in its final format.
# Keep processors only (drop wholesale rows) from each source, then stack
# Drop wholesale data from the NYC data
final_nyc_df <- nyc_df %>%
  filter(beans_processor == 1)
# Drop wholesale data from the NY food product data
final_ny_foods_df <- ny_foods_df %>%
  filter(beans_processor == 1)
# Drop wholesale data from the interview data
final_qual_df <- qual_df %>%
  filter(aggregator_cleaner == 1 |
           primary_processor == 1 |
           secondary_processor == 1)
# Combine processor data from all sources
df <- bind_rows(final_nyc_df,
                final_ny_foods_df,
                final_qual_df)
# BUG FIX: the original rm() listed final_ny_foods_df twice and never
# removed final_nyc_df, leaving it behind in the environment
rm(final_nyc_df, final_ny_foods_df, final_qual_df)
# Put in alphabetical order by company name
df <- df %>%
  arrange(company_name)
# Manually drop duplicates that appear under different names (keeping the
# preferred spelling), plus out-of-scope businesses
df <- df %>%
filter(!company_name %in% c("DR PRAEGERS",
"GARDEIN HAGERSTOWN, BRITISH COLOMBIA, CANADA MD",
"HOUSE FOODS AMERICA",
"PORT ROYAL SALES",
"SYS CLS",
"USDA")) %>%
filter(!str_detect(company_name, "BRITISH COLOMBIA")) %>%
mutate(
company_name = case_when(
company_name == "WILD KALE" ~ "WILDKALE",
TRUE ~ company_name)
)
# Manually fix duplicated businesses: drop all their rows and add back a
# single row that merges the information from every source
df <- df %>%
filter(company_name!="FURMANO FOODS") %>%
add_row(
company_name="FURMANO FOODS",
nys_product_purchased=1,
sold_to_NYC=1,
beans_processor=1,
wholesaler_distributor=0,
nyc_dashboard=1,
lrfs_business=1,
food_product_database=NA,
ny = 0,
aggregator_cleaner = 0,
primary_processor = 1,
secondary_processor=0,
mwbe = 0,
interview_data=1
)
df <- df %>%
filter(company_name != "GENESEE VALLEY BEAN CO") %>%
add_row(
company_name = "GENESEE VALLEY BEAN CO",
nys_product_purchased = NA,
sold_to_NYC = 0,
beans_processor = 1,
wholesaler_distributor = 1,
nyc_dashboard = NA,
lrfs_business = 1,
food_product_database = 1,
ny = 1,
aggregator_cleaner = 1,
primary_processor =0,
secondary_processor = 0,
mwbe = 0,
interview_data = 1
)
# Put in alphabetical order again (add_row appends at the end)
df <- df %>%
arrange(company_name)
# Add the (empty) columns that the qualitative team will fill in by hand,
# and put all columns in the order expected in the export
df <- df %>%
  mutate(
    street_address = NA,
    city = NA,
    state = NA,
    zip_code = NA,
    aggregator_scale = NA,
    primary_processor_scale = NA,
    secondary_processor_scale = NA,
    max_inv = NA
  ) %>%
  select(
    company_name, street_address, city, state, zip_code,
    ny, aggregator_cleaner, primary_processor, secondary_processor,
    aggregator_scale, primary_processor_scale,
    secondary_processor_scale, max_inv,
    lrfs_business, mwbe, sold_to_NYC, nys_product_purchased
  )

# These two indicators are known for every business, so NA means 0
df <- df %>%
  mutate(across(c(mwbe, sold_to_NYC), ~ replace_na(., 0)))
# Documentation for each column, written to an "info" sheet so the
# qualitative team knows how to fill in the "data" sheet.
# FIX: corrected typos in the user-facing description/notes strings
# ("Maxiumum", "is sells", "is sold", "Choose on", "busineses").
info <- tibble(
  variable_name = c(
    "company_name", "street_address", "city", "state",
    "zip_code", "ny", "aggregator_cleaner", "primary_processor" ,
    "secondary_processor",
    "aggregator_scale",
    "primary_processor_scale",
    "secondary_processor_scale",
    "max_inv",
    "lrfs_business",
    "mwbe",
    "sold_to_NYC",
    "nys_product_purchased"),
  variable_description = c(
    "Company name",
    "Street address (e.g., 1038 COURT ST)",
    "City",
    "State abbreviation (e.g., NY)",
    "Zip code as 5 digits",
    "0/1 variable indicating if the business is located in NY or not",
    "0/1 variable indicating if the business is a cleaner/aggregator or not",
    "0/1 variable indicating if the business is a primary processor or not",
    "0/1 variable indicating if the business is a secondary processor or not",
    "Scale of cleaner aggregator business. If not a cleaner/aggregator then leave blank, otherwise fill in with small, medium, large",
    "Scale of primary processor business. If not a primary processor then leave blank, otherwise fill in with small, medium, large",
    "Scale of secondary processor business. If not a secondary processor then leave blank, otherwise fill in with small, medium, large",
    "Maximum inventory capacity for this scale of processor",
    "0/1 variable indicating if the business sells NYS source identified product or not",
    "0/1 variable indicating if the business is an MWBE processor or not",
    "0/1 variable indicating if the business sold to NYC or not",
    "0/1 variable indicating if the business was part of a supply chain that sold NYS source identified product previously to NYC agencies"),
  notes = c(
    "",
    "Can be lower case or upper case, fine to just copy and paste from whatever source. Do NOT include state or zip code.",
    "",
    "Ideally it will be the state abbreviation (NY) but you can also include the full name New York.",
    "",
    "1 indicates yes and 0 indicates no",
    "1 indicates yes and 0 indicates no",
    "1 indicates yes and 0 indicates no",
    "1 indicates yes and 0 indicates no",
    "Enter the text `small`, `medium`, or `large`. If you have different scale categories then enter these (such as small/large)",
    "Enter the text `small`, `medium`, or `large`. If you have different scale categories then enter these (such as small/large)",
    "Enter the text `small`, `medium`, or `large`. If you have different scale categories then enter these (such as small/large)",
    "Choose one representative business from each category (e.g., small primary processor) and report the max inventory for all businesses in that category",
    "1 indicates yes and 0 indicates no. Please correct if you know the business sells source identified NYS product",
    "1 indicates yes and 0 indicates no. Do not change data in this column. ",
    "1 indicates yes and 0 indicates no. Do not change data in this column.",
    "1 indicates yes and 0 indicates no. Do not change data in this column.")
)
# Build the workbook: one sheet with the data to be completed manually,
# one with the variable documentation for the qualitative team
wb <- createWorkbook()
addWorksheet(wb, "data")
writeData(wb, "data", df)
addWorksheet(wb, "info")
writeData(wb, "info", info)
# Save; this file is sent out for manual completion
saveWorkbook(wb, "../data_processed/bean_processor_ADD_DATA.xlsx",
overwrite = TRUE)2.2.8 Processed data
The data that was saved in the step above was sent to the qualitative team and they manually added addresses, indicated those businesses that do not sell beans, added in data on the businesses location along the supply chain where missing (notably if the business was a primary or secondary processor), and estimated values for max inventory capacity.
We add the wholesale data that are MWBE to the data.
2.2.8.1 Import and clean
# Clear the environment except the wholesale data (and constant) still needed
rm(list=setdiff(ls(), c("naics_df", "nyc_wholesale_df", "nyc_bean_max")))

# Import the manually-completed processor file (n = 57)
df <- read_xlsx("../data_processed/bean_processor_PROCESSED.xlsx") %>%
  clean_names()

# Drop rows whose notes flag them for deletion (n = 31 remain)
df <- df %>%
  mutate(
    delete = case_when(
      str_detect(notes, "Delete") |
        str_detect(notes, "Too generic of a name. Not sure what co. it is.") ~ 1,
      TRUE ~ 0)) %>%
  select(delete, everything()) %>%
  filter(delete == 0)

# Keep NYS businesses, plus out-of-state businesses that sell NYS
# source-identified product or are MWBE (n = 14 remain)
df <- df %>%
  filter(state == "NY" |
           (state != "NY" & (lrfs_business == 1 | mwbe == 1)))

# Coerce max inventory to numeric; "unlimited" (non-numeric/NA) becomes
# what NYC purchased last year
df <- df %>%
  mutate(max_inv_lbs_year = coalesce(as.numeric(max_inv_lbs_year),
                                     nyc_bean_max))

# Drop working columns and push max inventory to the end
df <- df %>%
  select(!c(delete, notes, ends_with("_scale"))) %>%
  relocate(max_inv_lbs_year, .after = everything())

# Align the wholesale data's column name before stacking
# NOTE(review): nyc_wholesale_df still carries `sold_to_NYC` (upper case),
# so bind_rows below creates both sold_to_nyc and sold_to_NYC — confirm
# this is intended
naics_df <- naics_df %>%
  rename(sold_to_nyc = sold_to_NYC)

# Stack the aggregator/processor data with the two wholesale data sets
df <- bind_rows(df, naics_df, nyc_wholesale_df)

# NA indicators mean "no": replace with 0
df <- df %>%
  relocate(wholesaler_distributor, .after = "ny") %>%
  mutate(across(wholesaler_distributor:nys_product_purchased,
                ~ replace_na(., 0)))

# Wholesalers get NYC's total annual purchase as their max inventory
df <- df %>%
  mutate(max_inv_lbs_year = coalesce(max_inv_lbs_year, nyc_bean_max))
# Assume all wholesalers can sell organic; other supply chain actors can
# only if indicated in the source data (NA means no)
df <- df %>%
mutate(
organic = case_when(
wholesaler_distributor==1 ~ 1,
is.na(organic) ~ 0,
TRUE ~ organic)
)2.2.8.2 Geocode
library(tidygeocoder)
# Build a single full-address string for geocoding
df <- df %>%
mutate(
address = str_c(street_address, ", ",
city, ", ",
state, " ",
zip_code)
)
# Geocode with the US Census geocoder (free, no API key); addresses it
# cannot resolve come back with NA coordinates and are fixed below
df <- df %>%
geocode(address,
method = 'census',
lat = latitude,
long = longitude,
full_results = FALSE)
# Rows the Census geocoder could not resolve
missing <- df %>%
  filter(is.na(latitude))
# Fill in coordinates for those rows by hand. US FOODS has multiple
# locations, so its rows are matched on street address as well.
df <- df %>%
  mutate(
    latitude = case_when(
      company_name== "COMMUNITY FOOD WORKS (MILESTONE MILL)" ~ 41.937258,
      company_name== "NEW YORK BEAN, LLC" ~ 42.976152,
      company_name== "Unipak" ~ 40.694349,
      company_name== "GO AFRICA GLOBAL LLC" ~ 42.6478356,
      company_name== "HOME TOWN HAULING & RECYCLING, LLC" ~ 42.6478356,
      company_name== "MARJORIE MC KEOWN" ~ 42.24757385253906,
      company_name== "R & N TRANSPORT, INC." ~ 40.692420959472656,
      company_name== "SOMETHING GOOD TO EAT, INC." ~ 40.79843521118164,
      company_name == "US FOODS" &
        street_address=="702 POTENTIAL PKWY SCOTIA" ~ 42.838743,
      company_name == "US FOODS" &
        street_address=="95 MAIN ST" ~ 42.889337,
      company_name == "US FOODS" &
        street_address=="177 NGARA FRNTR FD TERMIN" ~ 42.875127,
      TRUE ~ latitude),
    longitude = case_when(
      company_name== "COMMUNITY FOOD WORKS (MILESTONE MILL)" ~ -74.020285,
      company_name== "NEW YORK BEAN, LLC" ~ -77.870366,
      company_name== "Unipak" ~ -74.297359,
      company_name== "GO AFRICA GLOBAL LLC" ~ -74.7924148,
      company_name== "HOME TOWN HAULING & RECYCLING, LLC" ~ -74.7924148,
      company_name== "MARJORIE MC KEOWN" ~ -79.3206558227539,
      company_name== "R & N TRANSPORT, INC." ~ -73.58958435058594,
      company_name== "SOMETHING GOOD TO EAT, INC." ~ -73.61048126220703,
      # BUG FIX: this branch was missing the US FOODS company check that
      # its matching latitude branch has (it matched on address alone)
      company_name == "US FOODS" &
        street_address=="702 POTENTIAL PKWY SCOTIA" ~ -73.979981,
      company_name == "US FOODS" &
        street_address=="95 MAIN ST" ~ -73.349132,
      company_name == "US FOODS" &
        street_address=="177 NGARA FRNTR FD TERMIN" ~ -78.820302,
      TRUE ~ longitude)
  )
# Put the coordinates first
df <- df %>%
  select(latitude, longitude, everything())
rm(missing, naics_df, nyc_wholesale_df)2.2.8.3 Save
Here we save a full version and the version that can be shared publicly with business names and addresses removed. We also drop any columns that are not relevant.
2.2.8.3.1 Full data
# Save full file - not to be shared publicly
write_csv(df, "../data_final/beans_processor_distributor_full.csv")2.2.8.3.2 Final data
Replace business names with a unique id. We also drop beans_processor as it was replaced with primary and secondary and it is confusing to include both. Lastly, we drop variables related to scale and local ownership as most of the data is missing and we are relying on other sources.
# Keep only the columns needed in the final data
df <- df %>%
  select(
    address, company_name,
    latitude, longitude,
    ny, wholesaler_distributor, aggregator_cleaner,
    primary_processor, secondary_processor,
    lrfs_business, mwbe, sold_to_nyc,
    organic, max_inv_lbs_year)

# Give each business a unique 5-digit id (10000, 10001, ...) and move it
# to the front
df <- df %>%
  mutate(id = row_number() + 9999) %>%
  relocate(id)
# Create the id <-> business name/address crosswalk and save it separately
beans_business_id <- df %>%
  select(id, address, company_name)
# BUG FIX: the original wrote `df` here, so the crosswalk file contained
# the full data set instead of just id/address/company_name
write_csv(beans_business_id, "../data_processed/beans_business_id.csv")
# Drop the identifying columns from the shareable file
df <- df %>%
select(-address, -company_name)
# Save to the folder with all data
write_csv(df, "../data_final/beans_processor_distributor.csv")
# Public copy excludes max inventory (written on the next line, to a
# user-specific absolute path)
df_nomaxinv <- df %>%
select(-max_inv_lbs_year)
write_csv(df_nomaxinv, "/Users/alg24/Library/CloudStorage/Box-Box/Research/CityFoodPolicy/2023.NYC_FFARRockefeller/research/data_master_record/CityFoodPolicy/drybeans/data_final/beans_processor_distributor.csv")2.3 Markets
2.4 Consumption
2.5 Institutional purchasing in New York State
We use data from NYC Food Policy, Good Food Purchasing data from 2019-2023, we download the full purchasing data set.
We provide average and standard deviation for price per lb. and kg. for dry beans by form (canned, dried, frozen), and by agency from 2019-2023. We drop data points where the price is more than two standard deviations outside of the average price.
2.5.1 Import data
Here we import data and define variables.
library(tidyverse)
library(readxl)
library(openxlsx)
library(knitr)
library(kableExtra)

# Workbook that will hold the summary output
wb <- createWorkbook()

# Import the citywide Good Food Purchasing data (FY19-23)
df <- read_xlsx("data_raw/Public-Dashboard-Data-for-Download-FY19-23.xlsx",
                sheet = "Citywide")

# Keep bean purchases only
df <- df %>%
  filter(`Food Product Category`=="Legumes" &
           str_detect(`Product Name`, "bean"))

# Classify each purchase as Canned, Frozen, or Dried (the default);
# the case_when branches are checked in order
df <- df %>%
  mutate(Form = case_when(
    str_detect(`Product Type`, "10 ") ~ "Canned",
    str_detect(`Product Type`, "CANS|CND") ~ "Canned",
    str_detect(`Product Type`, "CANNED") ~ "Canned",
    str_detect(`Product Name`, "canned") ~ "Canned",
    str_detect(`Product Type`, "FRZN") ~ "Frozen",
    TRUE ~ "Dried"))

# Derive weight in kg and the unit prices in a single step
df <- df %>%
  mutate(
    `Total Weight in kgs` = `Total Weight in lbs` * 0.45359237,
    `Price per kg` = `Total Cost`/`Total Weight in kgs`,
    `Price per lb` = `Total Cost`/`Total Weight in lbs`
  )
# Premium-eligibility dummy: 1 if the purchase was MWBE and/or NY State
# spend (either attribute can command a price premium)
df <- df %>%
mutate(
premium = case_when(
`MWBE y/n`=="Y" |
`NY State spend y/n`=="Y" ~ 1,
TRUE ~ 0)
)2.5.2 Understand missing total weight data
The missing data does have the number of units, but does not have weight in lbs. We can use the information provided about the product and estimate how much the product weighed and estimate a total weight.
Department of Education has a large purchase that lists the number of units. Units are #10 cans; we assume they each weigh 6 lbs 15 oz (111 oz).
For now, we are not going to take this step, but can in the future if needed.
# Rows with no recorded weight
missing <- df %>%
  filter(`Total Weight in lbs` == 0)

# Tabulate zero-weight purchases by agency and year (NA weight counts as
# not-zero, matching the original case_when fall-through)
testing <- df %>%
  mutate(zero_lbs = coalesce(as.numeric(`Total Weight in lbs` == 0), 0)) %>%
  group_by(Agency, `Time Period`, zero_lbs) %>%
  count()
rm(missing, testing)2.5.3 Summary
Removed outliers that were more than 2 standard deviations from the mean based on price per pound, within agency, year, and form. We dropped 17 observations (397 down to 380).
Data are pulled for purchases with no attributes that would allow for a higher price (i.e., not MWBE, not purchased from New York). Because we will run the model with different scenarios related to local and MWBE, the data of interest are the bean purchases of non-local and non-MWBE products (i.e., no premium, defined as NY State spend y/n==N and MWBE y/n == N). Note there are no purchases of beans from MWBE businesses in the data.
# Drop purchases with no recorded weight
df <- df %>%
  filter(`Total Weight in lbs` != 0)

# Flag and drop price-per-lb outliers (more than 2 SD from the mean)
# within agency x year x form cells; single-observation cells (sd = NA)
# are never flagged
df <- df %>%
  group_by(Agency, `Time Period`, Form) %>%
  mutate(
    mean_lb = mean(`Price per lb`, na.rm = TRUE),
    sd_lb = sd(`Price per lb`, na.rm = TRUE)
  ) %>%
  mutate(
    outlier = case_when(
      abs(`Price per lb` - mean_lb) > 2*sd_lb ~ 1,
      TRUE ~ 0)
  ) %>%
  filter(outlier == 0) %>%
  ungroup() %>%
  select(-mean_lb, -sd_lb)

# Keep only purchases with no local premium (there are no MWBE bean
# purchases in the data)
df <- df %>%
  filter(`NY State spend y/n` == "N")

# Summary statistics by agency, year, and form
# (note: `sum` here is a data frame and masks base::sum)
sum <- df %>%
  group_by(Agency, `Time Period`, Form) %>%
  summarise(
    mean_lb = mean(`Price per lb`, na.rm = TRUE),
    sd_lb = sd(`Price per lb`, na.rm = TRUE),
    mean_kg = mean(`Price per kg`, na.rm = TRUE),
    sd_kg = sd(`Price per kg`, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup()

# Write the summary to the workbook
addWorksheet(wb, "NYC_bean_purchasing")
writeData(wb, "NYC_bean_purchasing", sum)
# Documentation row for the summary worksheet.
# BUG FIX: in the original, `source =`, `url =`, and `notes =` were
# accidentally nested inside the description c(), creating a 4-element
# named description vector (and four recycled rows) instead of separate
# columns. They are now proper tibble columns.
info <- tibble(
  sheet_name = "NYC_bean_purchasing",
  description = "Mean and standard deviation of price/lb. and price/kg. for beans by agency, year, and form (e.g, dried, canned, frozen)",
  source = "NYC dashboard",
  url = "https://www.nyc.gov/site/foodpolicy/good-food-purchasing/citywidedata.page",
  notes = "Includes only purchases from outside of New York and not from MWBE businesses (i.e., no premiums associated with local/MWBE attributes included in the data)."
)
# Add the documentation sheet to the workbook
addWorksheet(wb, "info")
writeData(wb, "info", info)
# Save the workbook
saveWorkbook(wb, "data_final/NYC_bean_purchasing.xlsx",
overwrite = TRUE)# Keep data per lb. and for 2022 only
# 2022 price-per-pound table: drop the kg columns and zero out the SD for
# single-observation cells (where sd() is NA)
table <- sum %>%
  filter(`Time Period` == 2022) %>%
  select(-c(`Time Period`, mean_kg:last_col())) %>%
  mutate(sd_lb = replace_na(sd_lb, 0)) %>%
  rename(Mean = mean_lb, SD = sd_lb)

# Render as a styled HTML table
table %>%
  kable(
    digits = 2,
    caption = '<span style="color: black; font-size: 18px; font-weight: bold;">Price per pound for dried and canned beans purchased by New York City in 2022</span>') %>%
  kable_styling()
kable_styling()2.6 LCA
Jasmine and Elsie add.