This page lists only the publications co-authored by more than one researcher of the lab. See the personal websites of the members for more publications. Many publications arise from a strong network of national and international collaborations.

### 2021

Català, N.; Baixeries, J.; Ferrer-i-Cancho, R.; Padró, L.; Hernández-Fernández, A.

Zipf's laws of meaning in Catalan Journal Article

In: 2021.

Abstract | Links | BibTeX | Tags:

@article{Catala2021a,
  author        = {N. Català and J. Baixeries and R. Ferrer-i-Cancho and L. Padró and A. Hernández-Fernández},
  title         = {Zipf's laws of meaning in Catalan},
  year          = {2021},
  date          = {2021-01-01},
  eprint        = {2107.00042},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2107.00042},
  abstract      = {In his pioneering research, G. K. Zipf formulated a couple of statistical laws on the relationship between the frequency of a word with its number of meanings: the law of meaning distribution, relating the frequency of a word and its frequency rank, and the meaning-frequency law, relating the frequency of a word with its number of meanings. Although these laws were formulated more than half a century ago, they have been only investigated in a few languages. Here we present the first study of these laws in Catalan. We verify these laws in Catalan via the relationship among their exponents and that of the rank-frequency law. We present a new protocol for the analysis of these Zipfian laws that can be extended to other languages. We report the first evidence of two marked regimes for these laws in written language and speech, paralleling the two regimes in Zipf's rank-frequency law in large multi-author corpora discovered in early 2000s. Finally, the implications of these two regimes will be discussed.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article}
}

Carrera-Casado, David; Ferrer-i-Cancho, Ramon

The advent and fall of a vocabulary learning bias from communicative efficiency Journal Article

In: Biosemiotics, 2021.

Abstract | Links | BibTeX | Tags:

@article{Carrera2021a,
  author    = {David Carrera-Casado and Ramon Ferrer-i-Cancho},
  title     = {The advent and fall of a vocabulary learning bias from communicative efficiency},
  journal   = {Biosemiotics},
  year      = {2021},
  date      = {2021-01-01},
  url       = {https://arxiv.org/abs/2105.11519},
  abstract  = {It is well-known that, when sufficiently young children encounter a new word, they tend to attach it to a meaning that does not have a word yet in their lexicon. In previous research, the strategy was shown to be optimal from an information theoretic standpoint. However, the information theoretic model employed neither explains the weakening of that vocabulary learning bias in older children or polylinguals nor reproduces Zipf's meaning-frequency law, namely the non-linear relationship between the number of meanings of a word and its frequency. Here we consider a generalization of the model that is channeled to reproduce that law. The analysis of the new model reveals regions of the phase space where the bias disappears consistently with the weakening or loss of the bias in older children or polylinguals. In the deep learning era, the model is a transparent low-dimensional tool for future experimental research and illustrates the predictive power of a theoretical framework originally designed to shed light on the origins of Zipf's rank-frequency law.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Alemany-Puig, L.; Ferrer-i-Cancho, R.

Linear-time calculation of the expected sum of edge lengths in random projective linearizations of trees Journal Article

In: 2021.

Abstract | Links | BibTeX | Tags:

@article{Alemany2021b,
  author        = {L. Alemany-Puig and R. Ferrer-i-Cancho},
  title         = {Linear-time calculation of the expected sum of edge lengths in random projective linearizations of trees},
  year          = {2021},
  date          = {2021-01-01},
  eprint        = {2107.03277},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2107.03277},
  abstract      = {The syntactic structure of a sentence is often represented using syntactic dependency trees. The sum of the distances between syntactically related words has been in the limelight for the past decades. Research on dependency distances led to the formulation of the principle of dependency distance minimization whereby words in sentences are ordered so as to minimize that sum. Numerous random baselines have been defined to carry out related quantitative studies on languages. The simplest random baseline is the expected value of the sum in unconstrained random permutations of the words in the sentence, namely when all the shufflings of the words of a sentence are allowed and equally likely. Here we focus on a popular baseline: random projective permutations of the words of the sentence, that is, permutations where the syntactic dependency structure is projective, a formal constraint that sentences satisfy often in languages. Thus far, the expectation of the sum of dependency distances in random projective shufflings of a sentence has been estimated approximately with a Monte Carlo procedure whose cost is of the order of Zn, where n is the number of words of the sentence and Z is the number of samples; the larger Z, the lower the error of the estimation but the larger the time cost. Here we present formulae to compute that expectation without error in time of the order of n. Furthermore, we show that star trees maximize it, and devise a dynamic programming algorithm to retrieve the trees that minimize it.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article}
}

Alemany-Puig, L.; Esteban, J. L.; Ferrer-i-Cancho, R.

Minimum projective linearizations of trees in linear time Journal Article

In: under review, 2021.

Abstract | Links | BibTeX | Tags:

@article{Alemany2021a,
  author        = {L. Alemany-Puig and J. L. Esteban and R. Ferrer-i-Cancho},
  title         = {Minimum projective linearizations of trees in linear time},
  year          = {2021},
  date          = {2021-01-01},
  eprint        = {2102.03277},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2102.03277},
  note          = {under review},
  abstract      = {The minimum linear arrangement problem (MLA) consists of finding a mapping π from vertices of a graph to integers that minimizes the sum of dependency distances. For trees, various algorithms are available to solve the problem in polynomial time; the best known runs in subquadratic time in n=|V|. There exist variants of the MLA in which the arrangements are constrained to certain classes of projectivity. Iordanskii, and later Hochberg and Stallmann (HS), put forward O(n)-time algorithms that solve the problem when arrangements are constrained to be planar. We also consider linear arrangements of rooted trees that are constrained to be projective. Gildea and Temperley (GT) sketched an algorithm for the projectivity constraint which, as they claimed, runs in O(n) but did not provide any justification of its cost. In contrast, Park and Levy claimed that GT's algorithm runs in O(n log d_max) where dmax is the maximum degree but did not provide sufficient detail. Here we correct an error in HS's algorithm for the planar case, show its relationship with the projective case, and derive an algorithm for the projective case that runs undoubtlessly in O(n)-time.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article}
}

Torre, Iván G.; Dębowski, Łukasz; Hernández-Fernández, Antoni

Can Menzerath’s law be a criterion of complexity in communication? Journal Article

In: PLOS ONE, 16 (8), pp. 1-21, 2021.

Abstract | Links | BibTeX | Tags:

@article{Gonzales2021a,
  author    = {Iván G. Torre and Łukasz Dębowski and Antoni Hernández-Fernández},
  title     = {Can Menzerath’s law be a criterion of complexity in communication?},
  journal   = {PLOS ONE},
  volume    = {16},
  number    = {8},
  pages     = {1--21},
  publisher = {Public Library of Science},
  year      = {2021},
  date      = {2021-01-01},
  doi       = {10.1371/journal.pone.0256133},
  url       = {https://doi.org/10.1371/journal.pone.0256133},
  abstract  = {Menzerath’s law is a quantitative linguistic law which states that, on average, the longer is a linguistic construct, the shorter are its constituents. In contrast, Menzerath-Altmann’s law (MAL) is a precise mathematical power-law-exponential formula which expresses the expected length of the linguistic construct conditioned on the number of its constituents. In this paper, we investigate the anatomy of MAL for constructs being word tokens and constituents being syllables, measuring its length in graphemes. First, we derive the exact form of MAL for texts generated by the memoryless source with three emitted symbols, which can be interpreted as a monkey typing model or a null model. We show that this null model complies with Menzerath’s law, revealing that Menzerath’s law itself can hardly be a criterion of complexity in communication. This observation does not apply to the more precise Menzerath-Altmann’s law, which predicts an inverted regime for sufficiently range constructs, i.e., the longer is a word, the longer are its syllables. To support this claim, we analyze MAL on data from 21 languages, consisting of texts from the Standardized Project Gutenberg. We show the presence of the inverted regime, not exhibited by the null model, and we demonstrate robustness of our results. We also report the complicated distribution of syllable sizes with respect to their position in the word, which might be related with the emerging MAL. Altogether, our results indicate that Menzerath’s law—in terms of correlations—is a spurious observation, while complex patterns and efficiency dynamics should be rather attributed to specific forms of Menzerath-Altmann’s law.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

### 2020

González, I.; Artime, O.; Hernández-Fernández, A.; Luque Serrano, B.

¿Es el habla una señal crítica auto-organizada? Journal Article

In: Interdisciplina, 8 (20), pp. 113–128, 2020.

Abstract | Links | BibTeX | Tags:

@article{Gonzalez2020a,
  author    = {I. González and O. Artime and A. Hernández-Fernández and Luque Serrano, B.},
  title     = {¿Es el habla una señal crítica auto-organizada?},
  journal   = {Interdisciplina},
  volume    = {8},
  number    = {20},
  pages     = {113--128},
  year      = {2020},
  date      = {2020-01-01},
  doi       = {10.22201/ceiich.24485705e.2020.20.71206},
  url       = {http://hdl.handle.net/2117/175291},
  abstract  = {A lo largo del siglo XX, los estudios en lingüística cuantitativa han ido mostrando la aparición de leyes potenciales en las lenguas, primero en textos escritos y posteriormente en el habla. Son leyes que parecen ubicuas y robustas, pero ¿por qué aparecen en el lenguaje? ¿Son resultados espurios debidos a la arbitrariedad de la segmentación de las palabras, o realmente son universales de la comunicación compleja? ¿Podemos investigar la presencia de estas leyes en otros sistemas de comunicación animal de los que no conocemos el código? Los enfoques interdisciplinares y transdisciplinares en la lingüística y el estudio de los sistemas de comunicación se antojan imprescindibles. Se exponen a modo de ejemplo dos estudios recientes realizados sobre corpus acústicos de hasta dieciséis lenguas, mediante un método general de segmentación de señales (método de los umbrales). Exploramos aquí la posibilidad de que las leyes estadísticas que emergen en el lenguaje sean fruto de un sistema crítico auto–organizado, al igual que otros fenómenos presentes en la Naturaleza. El método de los umbrales que se presenta permite analizar cualquier tipo de señal sin necesidad de conocer su codificación o segmentación. Esto abre nuevos caminos en la investigación lingüística permitiendo entre otras cosas realizar estudios comparativos entre el lenguaje humano y otros sistemas de comunicación animal.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Alemany-Puig, L.; Ferrer-i-Cancho, R.

Edge crossings in random linear arrangements Journal Article

In: Journal of Statistical Mechanics, (2), pp. 023403, 2020.

Abstract | Links | BibTeX | Tags:

@article{Alemany2018a,
  author    = {L. Alemany-Puig and R. Ferrer-i-Cancho},
  title     = {Edge crossings in random linear arrangements},
  journal   = {Journal of Statistical Mechanics},
  number    = {2},
  pages     = {023403},
  year      = {2020},
  date      = {2020-01-01},
  doi       = {10.1088/1742-5468/ab6845},
  abstract  = {In spatial networks vertices are arranged in some space and edges may cross. When arranging vertices in a 1D lattice edges may cross when drawn above the vertex sequence as it happens in linguistic and biological networks. Here we investigate the general problem of the distribution of edge crossings in random arrangements of the vertices. We generalize the existing formula for the expectation of this number in random linear arrangements of trees to any network and derive an expression for the variance of the number of crossings in an arbitrary layout relying on a novel characterization of the algebraic structure of that variance in an arbitrary space. We provide compact formulae for the expectation and the variance in complete graphs, complete bipartite graphs, cycle graphs, one-regular graphs and various kinds of trees (star trees, quasi-star trees and linear trees). In these networks, the scaling of expectation and variance as a function of network size is asymptotically power-law-like in random linear arrangements. Our work paves the way for further research and applications in one-dimension or investigating the distribution of the number of crossings in lattices of higher dimension or other embeddings.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Alemany-Puig, L.; Ferrer-i-Cancho, R.

Fast calculation of the variance of edge crossings in random linear arrangements Journal Article

In: under review, 2020.

Abstract | Links | BibTeX | Tags:

@article{Alemany2019b,
  author        = {L. Alemany-Puig and R. Ferrer-i-Cancho},
  title         = {Fast calculation of the variance of edge crossings in random linear arrangements},
  year          = {2020},
  date          = {2020-01-01},
  eprint        = {2003.03258},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2003.03258},
  note          = {under review},
  abstract      = {The interest in spatial networks where vertices are embedded in a one-dimensional space is growing. Remarkable examples of these networks are syntactic dependency trees and RNA structures. In this setup, the vertices of the network are arranged linearly and then edges may cross when drawn above the sequence of vertices. Recently, two aspects of the distribution of the number of crossings in uniformly random linear arrangements have been investigated: the expectation and the variance. While the computation of the expectation is straightforward, that of the variance is not. Here we present fast algorithms to calculate that variance in arbitrary graphs and forests. As for the latter, the algorithm calculates variance in linear time with respect to the number of vertices. This paves the way for many applications that rely on an exact but fast calculation of that variance. These algorithms are based on novel arithmetic expressions for the calculation of the variance that we develop from previous theoretical work.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article}
}

Alemany-Puig, L.; Mora, M.; Ferrer-i-Cancho, R.

Reappraising the distribution of the number of edge crossings of graphs on a sphere Journal Article

In: Journal of Statistical Mechanics, pp. 083401, 2020.

Abstract | Links | BibTeX | Tags:

@article{Alemany2018b,
  author    = {L. Alemany-Puig and M. Mora and R. Ferrer-i-Cancho},
  title     = {Reappraising the distribution of the number of edge crossings of graphs on a sphere},
  journal   = {Journal of Statistical Mechanics},
  pages     = {083401},
  year      = {2020},
  date      = {2020-01-01},
  doi       = {10.1088/1742-5468/aba0ab},
  url       = {https://arxiv.org/abs/2003.03353},
  abstract  = {Many real transportation and mobility networks have their vertices placed on the surface of the Earth. In such embeddings, the edges laid on that surface may cross. In his pioneering research, Moon analyzed the distribution of the number of crossings on complete graphs and complete bipartite graphs whose vertices are located uniformly at random on the surface of a sphere assuming that vertex placements are independent from each other. Here we revise his derivation of that variance in the light of recent theoretical developments on the variance of crossings and computer simulations. We show that Moon’s formulae are inaccurate in predicting the true variance and provide exact formulae.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Ferrer-i-Cancho, R.; Esteban, J. L.; Gómez-Rodríguez, C.; Alemany-Puig, L.

The optimality of syntactic dependency distances Journal Article

In: under review, 2020.

Abstract | Links | BibTeX | Tags:

@article{Ferrer2020b,
  author        = {R. Ferrer-i-Cancho and J. L. Esteban and C. Gómez-Rodríguez and L. Alemany-Puig},
  title         = {The optimality of syntactic dependency distances},
  year          = {2020},
  date          = {2020-01-01},
  eprint        = {2007.15342},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2007.15342},
  note          = {under review},
  abstract      = {It is often stated that human languages, as other biological systems, are shaped by cost-cutting pressures but, to what extent? Attempts to quantify the degree of optimality of languages by means of an optimality score have been scarce and focused mostly on English. Here we recast the problem of the optimality of the word order of a sentence as an optimization problem on a spatial network where the vertices are words, arcs indicate syntactic dependencies and the space is defined by the linear order of the words in the sentence. We introduce a new score to quantify the cognitive pressure to reduce the distance between linked words in a sentence. The analysis of sentences from 93 languages representing 19 linguistic families reveals that half of languages are optimized to a 70% or more. The score indicates that distances are not significantly reduced in a few languages and confirms two theoretical predictions, i.e. that longer sentences are more optimized and that distances are more likely to be longer than expected by chance in short sentences. We present a new hierarchical ranking of languages by their degree of optimization. The statistical advantages of the new score call for a reevaluation of the evolution of dependency distance over time in languages as well as the relationship between dependency distance and linguistic competence. Finally, the principles behind the design of the score can be extended to develop more powerful normalizations of topological distances or physical distances in more dimensions.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article}
}

spatial network where the vertices are words, arcs indicate syntactic dependencies and the space is defined by the linear order of the words in the sentence. We introduce a new score to quantify the cognitive pressure to reduce the distance between linked words in a sentence. The analysis of sentences from 93 languages representing 19 linguistic families reveals that half of languages are optimized to a 70% or more. The score indicates that distances are not significantly reduced in a few languages and confirms two theoretical predictions, i.e. that longer sentences are more optimized and that distances are more likely to be longer than expected by chance in short sentences. We

present a new hierarchical ranking of languages by their degree of optimization. The statistical advantages of the new score call for a reevaluation of the evolution of dependency distance over time in languages as well as the relationship between dependency distance and linguistic competence. Finally, the principles behind the design of the score can be extended to develop more powerful normalizations of topological distances or physical distances in more dimensions.

### 2019

Hernández-Fernández, A.; González, I.; Garrido, J.; Lacasa, L.

Linguistic laws in speech: the case of Catalan and Spanish Journal Article

In: Entropy: international and interdisciplinary journal of entropy and information studies, 21 (1153), pp. e21121153:1–e21121153:16, 2019.

Abstract | Links | BibTeX | Tags:

@article{Hernandez2019b,
  author    = {A. Hernández-Fernández and I. González and J. Garrido and L. Lacasa},
  title     = {Linguistic laws in speech: the case of Catalan and Spanish},
  journal   = {Entropy: international and interdisciplinary journal of entropy and information studies},
  volume    = {21},
  number    = {12},
  pages     = {1153},
  year      = {2019},
  date      = {2019-11-01},
  doi       = {10.3390/e21121153},
  url       = {http://hdl.handle.net/2117/173623},
  abstract  = {In this work we consider Glissando Corpus—an oral corpus of Catalan and Spanish—and empirically analyze the presence of the four classical linguistic laws (Zipf’s law, Herdan’s law, Brevity law, and Menzerath–Altmann’s law) in oral communication, and further complement this with the analysis of two recently formulated laws: lognormality law and size-rank law. By aligning the acoustic signal of speech production with the speech transcriptions, we are able to measure and compare the agreement of each of these laws when measured in both physical and symbolic units. Our results show that these six laws are recovered in both languages but considerably more emphatically so when these are examined in physical units, hence reinforcing the so-called ‘physical hypothesis’ according to which linguistic laws might indeed have a physical origin and the patterns recovered in written texts would, therefore, be just a byproduct of the regularities already present in the acoustic signals of oral communication.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

González, I.; Luque Serrano, B.; Lacasa, L.; Kello, C.; Hernández-Fernández, A.

On the physical origin of linguistic laws and lognormality in speech Journal Article

In: Royal Society Open Science, 6 (191023), 2019.

Abstract | Links | BibTeX | Tags:

@article{Gonzalez2019a,
  author    = {I. González and Luque Serrano, B. and L. Lacasa and C. Kello and A. Hernández-Fernández},
  title     = {On the physical origin of linguistic laws and lognormality in speech},
  journal   = {Royal Society Open Science},
  volume    = {6},
  number    = {8},
  pages     = {191023},
  year      = {2019},
  date      = {2019-08-01},
  doi       = {10.1098/rsos.191023},
  url       = {http://hdl.handle.net/2117/173635},
  abstract  = {Physical manifestations of linguistic units include sources of variability due to factors of speech production which are by definition excluded from counts of linguistic symbols. In this work, we examine whether linguistic laws hold with respect to the physical manifestations of linguistic units in spoken English. The data we analyse come from a phonetically transcribed database of acoustic recordings of spontaneous speech known as the Buckeye Speech corpus. First, we verify with unprecedented accuracy that acoustically transcribed durations of linguistic units at several scales comply with a lognormal distribution, and we quantitatively justify this ‘lognormality law’ using a stochastic generative model. Second, we explore the four classical linguistic laws (Zipf’s Law, Herdan’s Law, Brevity Law and Menzerath–Altmann’s Law (MAL)) in oral communication, both in physical units and in symbolic units measured in the speech transcriptions, and find that the validity of these laws is typically stronger when using physical units than in their symbolic counterpart. Additional results include (i) coining a Herdan’s Law in physical units, (ii) a precise mathematical formulation of Brevity Law, which we show to be connected to optimal compression principles in information theory and allows to formulate and validate yet another law which we call the size-rank law or (iii) a mathematical derivation of MAL which also highlights an additional regime where the law is inverted. Altogether, these results support the hypothesis that statistical laws in language have a physical origin.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Hernández-Fernández, A.; González, I.; Lacasa, L.; Kello, C.; Luque Serrano, B.

A statistical model from information theory to explain Zipf's law of brevity Inproceedings

In: International Conference on Interdisciplinary Advances in Statistical Learning, pp. 94–94, Basque Center on Cognition, Brain and Language, 2019.

Abstract | Links | BibTeX | Tags:

@inproceedings{Hernandez2019c,
  author    = {A. Hernández-Fernández and I. González and L. Lacasa and C. Kello and Luque Serrano, B.},
  title     = {A statistical model from information theory to explain Zipf's law of brevity},
  booktitle = {International Conference on Interdisciplinary Advances in Statistical Learning},
  pages     = {94},
  publisher = {Basque Center on Cognition, Brain and Language},
  year      = {2019},
  date      = {2019-06-01},
  url       = {http://hdl.handle.net/2117/173894},
  abstract  = {Brevity and frequency are two crucial factors in the processes of statistical learning. The compression principle had already been used previously to explain the origin of Zipf’s law for the frequency of words. Here we use a model from information theory to also explain the Zipf’s law of abbreviation, or the statistical tendency of more frequent elements in language to be shorter (in characters in the case of written language, and in time durations for oral communication). As far as we know, we show for the first time that Zipf’s law of abbreviation is a global speech process that holds in words regardless of what are the linguistics units of study. In addition, the derived model from information theory allows us to fit empirically linguistic data considering both acoustic elements (phonemes, words and sentences) and its transcripts. This raises that the processes measured in units of written text are a byproduct of spontaneous speech patterns. The more a word is used, the greatest effort in compression that will make it shorter; but also the shorter it is, the more times it will be used statistically. This work paves the way for new experimental approaches to the study of statistical learning.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Hernández, A.; Ferrer-i-Cancho, R.

Lingüística cuantitativa. La estadística de las palabras Book

EMSE EDAPP y Prisanoticas Colecciones, 2019, (English title: Quantitative linguistics. The statistics of words).

BibTeX | Tags:

@book{Hernandez2019a,
  author    = {A. Hernández and R. Ferrer-i-Cancho},
  title     = {Lingüística cuantitativa. La estadística de las palabras},
  series    = {Grandes ideas de las matemáticas},
  publisher = {EMSE EDAPP y Prisanoticas Colecciones},
  year      = {2019},
  date      = {2019-01-01},
  note      = {English title: Quantitative linguistics. The statistics of words},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {book}
}

Casas, B.; Català, N.; Ferrer-i-Cancho, R.; Hernández-Fernández, A.; Baixeries, J.

Polysemy and brevity versus frequency in language Journal Article

In: Computer Speech & Language, 58 , pp. 19-50, 2019.

Abstract | Links | BibTeX | Tags:

@article{Casas2019,
  author    = {B. Casas and N. Català and R. Ferrer-i-Cancho and A. Hernández-Fernández and J. Baixeries},
  title     = {Polysemy and brevity versus frequency in language},
  journal   = {Computer Speech \& Language},
  volume    = {58},
  pages     = {19--50},
  year      = {2019},
  date      = {2019-01-01},
  doi       = {10.1016/j.csl.2019.03.007},
  url       = {https://doi.org/10.1016/j.csl.2019.03.007},
  abstract  = {The pioneering research of G. K. Zipf on the relationship between word frequency and other word features led to the formulation of various linguistic laws. The most popular is Zipf’s law for word frequencies. Here we focus on two laws that have been studied less intensively: the meaning-frequency law, i.e. the tendency of more frequent words to be more polysemous, and the law of abbreviation, i.e. the tendency of more frequent words to be shorter. In a previous work, we tested the robustness of these Zipfian laws for English, roughly measuring word length in number of characters and distinguishing adult from child speech. In the present article, we extend our study to other languages (Dutch and Spanish) and introduce two additional measures of length: syllabic length and phonemic length. Our correlation analysis indicates that both the meaning-frequency law and the law of abbreviation hold overall in all the analyzed languages.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

### 2018

Casas, B.; Català, N.; Ferrer-i-Cancho, R.; Hernández-Fernández, A.; Baixeries, J.

The polysemy of the words that children learn over time Journal Article

In: Interaction Studies, 19 (3), pp. 389 - 426, 2018.

Abstract | Links | BibTeX | Tags:

@article{Casas2018,
  author    = {B. Casas and N. Català and R. Ferrer-i-Cancho and A. Hernández-Fernández and J. Baixeries},
  title     = {The polysemy of the words that children learn over time},
  journal   = {Interaction Studies},
  volume    = {19},
  number    = {3},
  pages     = {389--426},
  year      = {2018},
  date      = {2018-01-01},
  doi       = {10.1075/is.16036.cas},
  url       = {http://arxiv.org/abs/1611.08807},
  abstract  = {Here we study polysemy as a potential learning bias in vocabulary learning in children. Words of low polysemy could be preferred as they reduce the disambiguation effort for the listener. However, such preference could be a side-effect of another bias: the preference of children for nouns in combination with the lower polysemy of nouns with respect to other part-of-speech categories. Our results show that mean polysemy in children increases over time in two phases, i.e. a fast growth till the 31st month followed by a slower tendency towards adult speech. In contrast, this evolution is not found in adults interacting with children. This suggests that children have a preference for non-polysemous words in their early stages of vocabulary acquisition. Interestingly, the evolutionary pattern described above weakens when controlling for syntactic category (noun, verb, adjective or adverb) but it does not disappear completely, suggesting that it could result from a combination of a standalone bias for low polysemy and a preference for nouns.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Our results show that mean polysemy in children increases over time in two phases, i.e. a fast growth till the 31st month followed by a slower tendency towards adult speech. In contrast, this evolution is not found in adults interacting with children. This suggests that children have a preference for non-polysemous words in their early stages of vocabulary acquisition. Interestingly, the evolutionary pattern described above weakens when controlling for syntactic category (noun, verb, adjective or adverb) but it does not disappear completely, suggesting that it could result from a combination of a standalone bias for low polysemy and a preference for nouns.

### 2017

González, I.; Luque Serrano, B.; Lacasa, L.; Miramontes, O.; Hernández-Fernández, A.

Emergence of linguistic laws in human voice Inproceedings

In: Crossroads in Complex Systems, pp. 98–98, 2017.

Abstract | Links | BibTeX | Tags:

@inproceedings{Gonzalez2017a,
  author    = {I. González and Luque Serrano, B. and L. Lacasa and O. Miramontes and A. Hernández-Fernández},
  title     = {Emergence of linguistic laws in human voice},
  booktitle = {Crossroads in Complex Systems},
  pages     = {98},
  year      = {2017},
  date      = {2017-06-01},
  url       = {http://hdl.handle.net/2117/121202},
  abstract  = {Linguistic laws constitute one of the quantitative cornerstones of modern cognitive sciences and have been routinely investigated in written corpora, or in the equivalent transcription of oral corpora. This means that inferences of statistical patterns of language in acoustics are biased by the arbitrary, language-dependent segmentation of the signal, and virtually precludes the possibility of making comparative studies between human voice and other animal communication systems. Here we bridge this gap by proposing a method that allows to measure such patterns in acoustic signals of arbitrary origin, without needs to have access to the language corpus underneath. The method has been applied to sixteen different human languages, recovering successfully some well-known laws of human communication at timescales even below the phoneme and finding yet another link between complexity and criticality in a biological system. These methods further pave the way for new comparative studies in animal communication or the analysis of signals of unknown code.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Casas, B.; Català, N.; Hernández-Fernández, A.; Ferrer-i-Cancho, R.; Baixeries, J.

Polysemy as a vocabulary learning bias Inproceedings

In: Cartmill, E. A.; Roberts, S.; Lyn, H.; Cornish, H. (Ed.): Book of Abstracts. Protolang 5 Barcelona, September 26–28, pp. 23, 2017, (Protolang 5 Barcelona, September 26–28).

@inproceedings{Casas2017a,
  author    = {Casas, B. and Català, N. and Hernández-Fernández, A. and {Ferrer-i-Cancho}, R. and Baixeries, J.},
  title     = {Polysemy as a vocabulary learning bias},
  editor    = {Cartmill, E. A. and Roberts, S. and Lyn, H. and Cornish, H.},
  booktitle = {Book of Abstracts. Protolang 5 Barcelona, September 26–28},
  pages     = {23},
  year      = {2017},
  url       = {http://bioling.ub.edu/wp-content/uploads/2017/09/protolang5_book_of_abstracts.pdf},
  note      = {Protolang 5 Barcelona, September 26–28},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

### 2016

Lozano, A.; Casas, B.; Bentz, C.; Ferrer-i-Cancho, R.

Fast calculation of entropy with Zhang's estimator Incollection

In: Kelih, E.; Knight, R.; Mačutek, J.; Wilson, A. (Ed.): Issues in Quantitative Linguistics 4. Dedicated to Reinhard Köhler on the occasion of his 65th birthday, pp. 273–285, RAM-Verlag, Lüdenscheid, 2016, (No. 23 of the series "Studies in Quantitative Linguistics").

Abstract | Links | BibTeX | Tags:

@incollection{Lozano2016a,
  author    = {Lozano, A. and Casas, B. and Bentz, C. and {Ferrer-i-Cancho}, R.},
  title     = {Fast calculation of entropy with {Zhang's} estimator},
  editor    = {Kelih, E. and Knight, R. and Mačutek, J. and Wilson, A.},
  booktitle = {Issues in Quantitative Linguistics 4. Dedicated to Reinhard Köhler on the occasion of his 65th birthday},
  series    = {Studies in Quantitative Linguistics},
  number    = {23},
  pages     = {273--285},
  publisher = {RAM-Verlag},
  address   = {Lüdenscheid},
  year      = {2016},
  url       = {https://arxiv.org/abs/1707.08290},
  abstract  = {Entropy is a fundamental property of a repertoire. Here, we present an efficient algorithm to estimate the entropy of types with the help of Zhang's estimator. The algorithm takes advantage of the fact that the number of different frequencies in a text is in general much smaller than the number of types. We justify the convenience of the algorithm by means of an analysis of the statistical properties of texts from more than 1000 languages. Our work opens up various possibilities for future research.},
  pubstate  = {published},
  tppubtype = {incollection}
}

Hernández-Fernández, A.; Ferrer-i-Cancho, R.

The infochemical core Journal Article

In: Journal of Quantitative Linguistics, 23 , pp. 133-153, 2016.

Abstract | Links | BibTeX | Tags:

@article{Hernandez2016b,
  author    = {Hernández-Fernández, A. and {Ferrer-i-Cancho}, R.},
  title     = {The infochemical core},
  journal   = {Journal of Quantitative Linguistics},
  volume    = {23},
  pages     = {133--153},
  year      = {2016},
  doi       = {10.1080/09296174.2016.1142323},
  abstract  = {Vocalizations, and less often gestures, have been the object of linguistic research for decades. However, the development of a general theory of communication with human language as a particular case requires a clear understanding of the organization of communication through other means. Infochemicals are chemical compounds that carry information and are employed by small organisms that cannot emit acoustic signals of an optimal frequency to achieve successful communication. Here, we investigate the distribution of infochemicals across species when they are ranked by their degree or the number of species with which they are associated (because they produce them or are sensitive to them). We evaluate the quality of the fit of different functions to the dependency between degree and rank by means of a penalty for the number of parameters of the function. Surprisingly, a double Zipf (a Zipf distribution with two regimes, each with a different exponent) is the model yielding the best fit although it is the function with the largest number of parameters. This suggests that the worldwide repertoire of infochemicals contains a core which is shared by many species and is reminiscent of the core vocabularies found for human language in dictionaries or large corpora.},
  pubstate  = {published},
  tppubtype = {article}
}

Hernández-Fernández, A.; Casas, B.; Ferrer-i-Cancho, R.; Baixeries, J.

Testing the robustness of laws of polysemy and brevity versus frequency Inproceedings

In: Král, P.; Martín-Vide, C. (Ed.): 4th International Conference on Statistical Language and Speech Processing (SLSP 2016). Lecture Notes in Computer Science 9918, pp. 19–29, 2016.

Abstract | Links | BibTeX | Tags:

@inproceedings{Hernandez2016a,
  author    = {Hernández-Fernández, A. and Casas, B. and {Ferrer-i-Cancho}, R. and Baixeries, J.},
  title     = {Testing the robustness of laws of polysemy and brevity versus frequency},
  editor    = {Král, P. and Martín-Vide, C.},
  booktitle = {4th International Conference on Statistical Language and Speech Processing ({SLSP} 2016)},
  series    = {Lecture Notes in Computer Science},
  volume    = {9918},
  pages     = {19--29},
  year      = {2016},
  doi       = {10.1007/978-3-319-45925-7_2},
  abstract  = {The pioneering research of G.K. Zipf on the relationship between word frequency and other word features led to the formulation of various linguistic laws. Here we focus on a couple of them: the meaning-frequency law, i.e. the tendency of more frequent words to be more polysemous, and the law of abbreviation, i.e. the tendency of more frequent words to be shorter. Here we evaluate the robustness of these laws in contexts where they have not been explored yet to our knowledge. The recovery of the laws again in new conditions provides support for the hypothesis that they originate from abstract mechanisms.},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

### 2014

Casas, B.; Català, N.; Ferrer-i-Cancho, R.; Baixeries, J.

The evolution of polysemy in child language Inproceedings

In: Cartmill, E. A.; Roberts, S.; Lyn, H.; Cornish, H. (Ed.): The Evolution of Language: Proceedings of the 10th International Conference (EVOLANG10), pp. 409–410, 2014, (Evolution of Language Conference (Evolang 2014). Vienna, Austria, April 14-17).

@inproceedings{Casas2014,
  author    = {Casas, B. and Català, N. and {Ferrer-i-Cancho}, R. and Baixeries, J.},
  title     = {The evolution of polysemy in child language},
  editor    = {Cartmill, E. A. and Roberts, S. and Lyn, H. and Cornish, H.},
  booktitle = {The Evolution of Language: Proceedings of the 10th International Conference ({EVOLANG10})},
  pages     = {409--410},
  year      = {2014},
  doi       = {10.1142/9789814603638_0068},
  note      = {Evolution of Language Conference (Evolang 2014). Vienna, Austria, April 14-17},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Ferrer-i-Cancho, R.; Hernández-Fernández, A.; Baixeries, J.; Dębowski, Ł.; Mačutek, J.

When is Menzerath-Altmann law mathematically trivial? A new approach Journal Article

In: Statistical Applications in Genetics and Molecular Biology, 13 , pp. 633-644, 2014.

Abstract | Links | BibTeX | Tags:

@article{Ferrer2014,
  author    = {{Ferrer-i-Cancho}, R. and Hernández-Fernández, A. and Baixeries, J. and Dębowski, Ł. and Mačutek, J.},
  title     = {When is {Menzerath-Altmann} law mathematically trivial? {A} new approach},
  journal   = {Statistical Applications in Genetics and Molecular Biology},
  volume    = {13},
  pages     = {633--644},
  year      = {2014},
  doi       = {10.1515/sagmb-2013-0034},
  abstract  = {Menzerath’s law, the tendency of Z (the mean size of the parts) to decrease as X (the number of parts) increases, is found in language, music and genomes. Recently, it has been argued that the presence of the law in genomes is an inevitable consequence of the fact that Z=Y/X, which would imply that Z scales with X as Z∼1/X. That scaling is a very particular case of Menzerath-Altmann law that has been rejected by means of a correlation test between X and Y in genomes, being X the number of chromosomes of a species, Y its genome size in bases and Z the mean chromosome size. Here we review the statistical foundations of that test and consider three non-parametric tests based upon different correlation metrics and one parametric test to evaluate if Z∼1/X in genomes. The most powerful test is a new non-parametric one based upon the correlation ratio, which is able to reject Z∼1/X in nine out of 11 taxonomic groups and detect a borderline group. Rather than a fact, Z∼1/X is a baseline that real genomes do not meet. The view of Menzerath-Altmann law as inevitable is seriously flawed.},
  pubstate  = {published},
  tppubtype = {article}
}

### 2013

Ferrer-i-Cancho, R.; Hernández-Fernández, A.

The failure of the law of brevity in two New World primates. Statistical caveats Journal Article

In: Glottotheory, 4 (1), 2013.

Abstract | Links | BibTeX | Tags:

@article{Ferrer2013a,
  author    = {{Ferrer-i-Cancho}, R. and Hernández-Fernández, A.},
  title     = {The failure of the law of brevity in two {New World} primates. {Statistical} caveats},
  journal   = {Glottotheory},
  volume    = {4},
  number    = {1},
  year      = {2013},
  doi       = {10.1524/glot.2013.0004},
  abstract  = {Parallels of Zipf’s law of brevity, the tendency of more frequent words to be shorter, have been found in bottlenose dolphins and Formosan macaques. Although these findings suggest that behavioral repertoires are shaped by a general principle of compression, common marmosets and golden-backed uakaris do not exhibit the law. However, we argue that the law may be impossible or difficult to detect statistically in a given species if the repertoire is too small, a problem that could be affecting golden backed uakaris, and show that the law is present in a subset of the repertoire of common marmosets. We suggest that the visibility of the law will depend on the subset of the repertoire under consideration or the repertoire size.},
  pubstate  = {published},
  tppubtype = {article}
}

Ferrer-i-Cancho, R.; Hernández-Fernández, A.; Lusseau, D.; Agoramoorthy, G.; Hsu, M. J.; Semple, S.

Compression as a universal principle of animal behavior Journal Article

In: Cognitive Science, 37 (8), pp. 1565-1578, 2013.

Abstract | Links | BibTeX | Tags:

@article{Ferrer2013b,
  author    = {{Ferrer-i-Cancho}, R. and Hernández-Fernández, A. and Lusseau, D. and Agoramoorthy, G. and Hsu, M. J. and Semple, S.},
  title     = {Compression as a universal principle of animal behavior},
  journal   = {Cognitive Science},
  volume    = {37},
  number    = {8},
  pages     = {1565--1578},
  year      = {2013},
  doi       = {10.1111/cogs.12061},
  url       = {http://arxiv.org/abs/1303.6175},
  abstract  = {A key aim in biology and psychology is to identify fundamental principles underpinning the behavior of animals, including humans. Analyses of human language and the behavior of a range of non‐human animal species have provided evidence for a common pattern underlying diverse behavioral phenomena: Words follow Zipf's law of brevity (the tendency of more frequently used words to be shorter), and conformity to this general pattern has been seen in the behavior of a number of other animals. It has been argued that the presence of this law is a sign of efficient coding in the information theoretic sense. However, no strong direct connection has been demonstrated between the law and compression, the information theoretic principle of minimizing the expected length of a code. Here, we show that minimizing the expected code length implies that the length of a word cannot increase as its frequency increases. Furthermore, we show that the mean code length or duration is significantly small in human language, and also in the behavior of other species in all cases where agreement with the law of brevity has been found. We argue that compression is a general principle of animal behavior that reflects selection for efficiency of coding.},
  pubstate  = {published},
  tppubtype = {article}
}

Ferrer-i-Cancho, R.; Forns, N.; Hernández-Fernández, A.; Bel-Enguix, G.; Baixeries, J.

The challenges of statistical patterns of language: the case of Menzerath's law in genomes Journal Article

In: Complexity, 18 (3), pp. 11-17, 2013.

Abstract | Links | BibTeX | Tags:

@article{Ferrer2013c,
  author    = {{Ferrer-i-Cancho}, R. and Forns, N. and Hernández-Fernández, A. and Bel-Enguix, G. and Baixeries, J.},
  title     = {The challenges of statistical patterns of language: the case of {Menzerath's} law in genomes},
  journal   = {Complexity},
  volume    = {18},
  number    = {3},
  pages     = {11--17},
  year      = {2013},
  doi       = {10.1002/cplx.21429},
  abstract  = {The importance of statistical patterns of language has been debated over decades. Although Zipf's law is perhaps the most popular case, recently, Menzerath's law has begun to be involved. Menzerath's law manifests in language, music and genomes as a tendency of the mean size of the parts to decrease as the number of parts increases in many situations. This statistical regularity emerges also in the context of genomes, for instance, as a tendency of species with more chromosomes to have a smaller mean chromosome size. It has been argued that the instantiation of this law in genomes is not indicative of any parallel between language and genomes because (a) the law is inevitable and (b) noncoding DNA dominates genomes. Here mathematical, statistical, and conceptual challenges of these criticisms are discussed. Two major conclusions are drawn: the law is not inevitable and languages also have a correlate of noncoding DNA. However, the wide range of manifestations of the law in and outside genomes suggests that the striking similarities between noncoding DNA and certain linguistics units could be anecdotal for understanding the recurrence of that statistical law.},
  pubstate  = {published},
  tppubtype = {article}
}

Ferrer-i-Cancho, R.; Baixeries, J.; Hernández-Fernández, A.

Erratum to "Random models of Menzerath-Altmann law in genomes" (BioSystems 107 (3), 167-173) Journal Article

In: Biosystems, 111 (3), pp. 216-217, 2013.

@article{Ferrer2013d,
  author    = {{Ferrer-i-Cancho}, R. and Baixeries, J. and Hernández-Fernández, A.},
  title     = {Erratum to ``Random models of {Menzerath-Altmann} law in genomes'' ({BioSystems} 107 (3), 167--173)},
  journal   = {Biosystems},
  volume    = {111},
  number    = {3},
  pages     = {216--217},
  year      = {2013},
  doi       = {10.1016/j.biosystems.2013.01.004},
  abstract  = {Here we improve the mathematical arguments of Baixeries et al (BioSystems 107(3) (2012) 167–173). The corrections do not alter the conclusion that the random breakage model yields an insufficient fit to the scaling of mean chromosome length as a function of chromosome number in real genomes.},
  pubstate  = {published},
  tppubtype = {article}
}

Baixeries, J.; Hernández-Fernández, A.; Forns, N.; Ferrer-i-Cancho, R.

The parameters of Menzerath-Altmann law in genomes Journal Article

In: Journal of Quantitative Linguistics, 20 (2), pp. 94-104, 2013.

Abstract | Links | BibTeX | Tags:

@article{Baixeries2013a,
  author    = {Baixeries, J. and Hernández-Fernández, A. and Forns, N. and {Ferrer-i-Cancho}, R.},
  title     = {The parameters of {Menzerath-Altmann} law in genomes},
  journal   = {Journal of Quantitative Linguistics},
  volume    = {20},
  number    = {2},
  pages     = {94--104},
  year      = {2013},
  doi       = {10.1080/09296174.2013.773141},
  pubstate  = {published},
  tppubtype = {article}
}

Baixeries, J.; Elvevåg, B.; Ferrer-i-Cancho, R.

The evolution of the exponent of Zipf's law in language ontogeny Journal Article

In: PLoS ONE, 8 (3), pp. e53227, 2013.

Abstract | Links | BibTeX | Tags:

@article{Baixeries2013b,
  author    = {Baixeries, J. and Elvevåg, B. and {Ferrer-i-Cancho}, R.},
  title     = {The evolution of the exponent of {Zipf's} law in language ontogeny},
  journal   = {PLoS ONE},
  volume    = {8},
  number    = {3},
  pages     = {e53227},
  year      = {2013},
  doi       = {10.1371/journal.pone.0053227},
  pubstate  = {published},
  tppubtype = {article}
}

### 2012

Baixeries, J.; Hernández-Fernández, A.; Ferrer-i-Cancho, R.

Random models of Menzerath-Altmann law in genomes Journal Article

In: Biosystems, 107 , pp. 167-173, 2012.

@article{Baixeries2012a,
  author    = {Baixeries, J. and Hernández-Fernández, A. and {Ferrer-i-Cancho}, R.},
  title     = {Random models of {Menzerath-Altmann} law in genomes},
  journal   = {Biosystems},
  volume    = {107},
  number    = {3},
  pages     = {167--173},
  year      = {2012},
  doi       = {10.1016/j.biosystems.2011.11.010},
  abstract  = {Recently, a random breakage model has been proposed to explain the negative correlation between mean chromosome length and chromosome number that is found in many groups of species and is consistent with Menzerath–Altmann law, a statistical law that defines the dependency between the mean size of the whole and the number of parts in quantitative linguistics. Here, the central assumption of the model, namely that genome size is independent from chromosome number is reviewed. This assumption is shown to be unrealistic from the perspective of chromosome structure and the statistical analysis of real genomes. A general class of random models, including that random breakage model, is analyzed. For any model within this class, a power law with an exponent of −1 is predicted for the expectation of the mean chromosome size as a function of chromosome length, a functional dependency that is not supported by real genomes. The random breakage and variants keeping genome size and chromosome number independent raise no serious objection to the relevance of correlations consistent with Menzerath–Altmann law across taxonomic groups and the possibility of a connection between human language and genomes through that law.},
  pubstate  = {published},
  tppubtype = {article}
}

### 2011

Hernández-Fernández, A.; Baixeries, J.; Forns, N.; Ferrer-i-Cancho, R.

Size of the whole versus number of parts in genomes Journal Article

In: Entropy, 13 , pp. 1465-1480, 2011.

Abstract | Links | BibTeX | Tags:

@article{Hernandez2011a,
  author    = {Hernández-Fernández, A. and Baixeries, J. and Forns, N. and {Ferrer-i-Cancho}, R.},
  title     = {Size of the whole versus number of parts in genomes},
  journal   = {Entropy},
  volume    = {13},
  number    = {8},
  pages     = {1465--1480},
  year      = {2011},
  doi       = {10.3390/e13081465},
  abstract  = {It is known that chromosome number tends to decrease as genome size increases in angiosperm plants. Here the relationship between number of parts (the chromosomes) and size of the whole (the genome) is studied for other groups of organisms from different kingdoms. Two major results are obtained. First, the finding of relationships of the kind “the more parts the smaller the whole” as in angiosperms, but also relationships of the kind “the more parts the larger the whole”. Second, these dependencies are not linear in general. The implications of the dependencies between genome size and chromosome number are two-fold. First, they indicate that arguments against the relevance of the finding of negative correlations consistent with Menzerath-Altmann law (a linguistic law that relates the size of the parts with the size of the whole) in genomes are seriously flawed. Second, they unravel the weakness of a recent model of chromosome lengths based upon random breakage that assumes that chromosome number and genome size are independent.},
  pubstate  = {published},
  tppubtype = {article}
}

### 2008

Ferrer-i-Cancho, R.; Hernández-Fernández, A.

Power laws and the golden number Incollection

In: Kelih, E.; Levickij, V.; Altmann, G. (Ed.): Problems of text analysis, 2008.

Abstract | Links | BibTeX | Tags:

@incollection{Ferrer2008d,
  author    = {{Ferrer-i-Cancho}, R. and Hernández-Fernández, A.},
  title     = {Power laws and the golden number},
  editor    = {Kelih, E. and Levickij, V. and Altmann, G.},
  booktitle = {Problems of text analysis},
  year      = {2008},
  url       = {https://www.cs.upc.edu/%7Erferrericancho/Ferrer_i_Cancho_and_Hernandez_2008.pdf},
  abstract  = {The distribution of many real discrete random variables (e.g., the frequency of words, the population of cities) can be approximated by a zeta distribution, that is known popularly as Zipf’s law, or power law in physics. Here we revisit the relationship between power law distribution of a magnitude and the corresponding power relationship between the magnitude of a certain element and its rank. We show that the exponents of the two power laws coincide when its value is the famous golden number.},
  pubstate  = {published},
  tppubtype = {incollection}
}